From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:54:28 +0200
Subject: Adding upstream version 18.2.2.

Signed-off-by: Daniel Baumann
---
 src/rocksdb/db/arena_wrapped_db_iter.cc | 160 +
 src/rocksdb/db/arena_wrapped_db_iter.h | 127 +
 src/rocksdb/db/blob/blob_constants.h | 16 +
 src/rocksdb/db/blob/blob_contents.cc | 90 +
 src/rocksdb/db/blob/blob_contents.h | 56 +
 src/rocksdb/db/blob/blob_counting_iterator.h | 146 +
 src/rocksdb/db/blob/blob_counting_iterator_test.cc | 327 +
 src/rocksdb/db/blob/blob_fetcher.cc | 34 +
 src/rocksdb/db/blob/blob_fetcher.h | 37 +
 src/rocksdb/db/blob/blob_file_addition.cc | 156 +
 src/rocksdb/db/blob/blob_file_addition.h | 67 +
 src/rocksdb/db/blob/blob_file_addition_test.cc | 211 +
 src/rocksdb/db/blob/blob_file_builder.cc | 446 ++
 src/rocksdb/db/blob/blob_file_builder.h | 112 +
 src/rocksdb/db/blob/blob_file_builder_test.cc | 680 ++
 src/rocksdb/db/blob/blob_file_cache.cc | 102 +
 src/rocksdb/db/blob/blob_file_cache.h | 52 +
 src/rocksdb/db/blob/blob_file_cache_test.cc | 269 +
 .../db/blob/blob_file_completion_callback.h | 101 +
 src/rocksdb/db/blob/blob_file_garbage.cc | 134 +
 src/rocksdb/db/blob/blob_file_garbage.h | 57 +
 src/rocksdb/db/blob/blob_file_garbage_test.cc | 174 +
 src/rocksdb/db/blob/blob_file_meta.cc | 62 +
 src/rocksdb/db/blob/blob_file_meta.h | 170 +
 src/rocksdb/db/blob/blob_file_reader.cc | 610 ++
 src/rocksdb/db/blob/blob_file_reader.h | 108 +
 src/rocksdb/db/blob/blob_file_reader_test.cc | 1024 +++
 src/rocksdb/db/blob/blob_garbage_meter.cc | 100 +
 src/rocksdb/db/blob/blob_garbage_meter.h | 102 +
 src/rocksdb/db/blob/blob_garbage_meter_test.cc | 197 +
 src/rocksdb/db/blob/blob_index.h | 187 +
 src/rocksdb/db/blob/blob_log_format.cc | 143 +
 src/rocksdb/db/blob/blob_log_format.h | 164 +
 src/rocksdb/db/blob/blob_log_sequential_reader.cc | 134 +
 src/rocksdb/db/blob/blob_log_sequential_reader.h | 83 +
 src/rocksdb/db/blob/blob_log_writer.cc | 178 +
 src/rocksdb/db/blob/blob_log_writer.h | 83 +
 src/rocksdb/db/blob/blob_read_request.h | 58 +
 src/rocksdb/db/blob/blob_source.cc | 488 ++
 src/rocksdb/db/blob/blob_source.h | 153 +
 src/rocksdb/db/blob/blob_source_test.cc | 1624 ++++
 src/rocksdb/db/blob/db_blob_basic_test.cc | 1789 +++++
 src/rocksdb/db/blob/db_blob_compaction_test.cc | 913 +++
 src/rocksdb/db/blob/db_blob_corruption_test.cc | 82 +
 src/rocksdb/db/blob/db_blob_index_test.cc | 602 ++
 src/rocksdb/db/blob/prefetch_buffer_collection.cc | 21 +
 src/rocksdb/db/blob/prefetch_buffer_collection.h | 38 +
 src/rocksdb/db/builder.cc | 434 ++
 src/rocksdb/db/builder.h | 77 +
 src/rocksdb/db/c.cc | 6390 +++++++++++++++
 src/rocksdb/db/c_test.c | 3476 +++++++++
 src/rocksdb/db/column_family.cc | 1683 ++++
 src/rocksdb/db/column_family.h | 845 ++
 src/rocksdb/db/column_family_test.cc | 3453 ++++++++
 src/rocksdb/db/compact_files_test.cc | 502 ++
 src/rocksdb/db/compaction/clipping_iterator.h | 276 +
 .../db/compaction/clipping_iterator_test.cc | 259 +
 src/rocksdb/db/compaction/compaction.cc | 855 ++
 src/rocksdb/db/compaction/compaction.h | 559 ++
 .../db/compaction/compaction_iteration_stats.h | 49 +
 src/rocksdb/db/compaction/compaction_iterator.cc | 1338 ++++
 src/rocksdb/db/compaction/compaction_iterator.h | 513 ++
 .../db/compaction/compaction_iterator_test.cc | 1618 ++++
 src/rocksdb/db/compaction/compaction_job.cc | 2060 +++++
 src/rocksdb/db/compaction/compaction_job.h | 500 ++
 .../db/compaction/compaction_job_stats_test.cc | 975 +++
 src/rocksdb/db/compaction/compaction_job_test.cc | 2451 ++++++
 src/rocksdb/db/compaction/compaction_outputs.cc | 646 ++
 src/rocksdb/db/compaction/compaction_outputs.h | 385 +
 src/rocksdb/db/compaction/compaction_picker.cc | 1234 +++
 src/rocksdb/db/compaction/compaction_picker.h | 323 +
 .../db/compaction/compaction_picker_fifo.cc | 433 ++
 src/rocksdb/db/compaction/compaction_picker_fifo.h | 63 +
 .../db/compaction/compaction_picker_level.cc | 841 ++
 .../db/compaction/compaction_picker_level.h | 33 +
 .../db/compaction/compaction_picker_test.cc | 3964 ++++++++++
 .../db/compaction/compaction_picker_universal.cc | 1450 ++++
 .../db/compaction/compaction_picker_universal.h | 32 +
 .../db/compaction/compaction_service_job.cc | 829 ++
 .../db/compaction/compaction_service_test.cc | 966 +++
 src/rocksdb/db/compaction/compaction_state.cc | 46 +
 src/rocksdb/db/compaction/compaction_state.h | 42 +
 src/rocksdb/db/compaction/file_pri.h | 92 +
 src/rocksdb/db/compaction/sst_partitioner.cc | 90 +
 src/rocksdb/db/compaction/subcompaction_state.cc | 106 +
 src/rocksdb/db/compaction/subcompaction_state.h | 214 +
 .../db/compaction/tiered_compaction_test.cc | 2028 +++++
 src/rocksdb/db/comparator_db_test.cc | 678 ++
 src/rocksdb/db/convenience.cc | 81 +
 src/rocksdb/db/corruption_test.cc | 1587 ++++
 src/rocksdb/db/cuckoo_table_db_test.cc | 361 +
 src/rocksdb/db/db_basic_test.cc | 4643 +++++++++++
 src/rocksdb/db/db_block_cache_test.cc | 2313 ++++++
 src/rocksdb/db/db_bloom_filter_test.cc | 3498 +++++++++
 src/rocksdb/db/db_compaction_filter_test.cc | 1036 +++
 src/rocksdb/db/db_compaction_test.cc | 8227 ++++++++++++++++++++
 src/rocksdb/db/db_dynamic_level_test.cc | 507 ++
 src/rocksdb/db/db_encryption_test.cc | 130 +
 src/rocksdb/db/db_filesnapshot.cc | 442 ++
 src/rocksdb/db/db_flush_test.cc | 3084 ++++++++
 src/rocksdb/db/db_impl/compacted_db_impl.cc | 257 +
 src/rocksdb/db/db_impl/compacted_db_impl.h | 154 +
 src/rocksdb/db/db_impl/db_impl.cc | 5918 ++++++++++++++
 src/rocksdb/db/db_impl/db_impl.h | 2804 +++++++
 src/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 3857 +++++++++
 src/rocksdb/db/db_impl/db_impl_debug.cc | 312 +
 src/rocksdb/db/db_impl/db_impl_experimental.cc | 158 +
 src/rocksdb/db/db_impl/db_impl_files.cc | 1013 +++
 src/rocksdb/db/db_impl/db_impl_open.cc | 2106 +++++
 src/rocksdb/db/db_impl/db_impl_readonly.cc | 341 +
 src/rocksdb/db/db_impl/db_impl_readonly.h | 170 +
 src/rocksdb/db/db_impl/db_impl_secondary.cc | 967 +++
 src/rocksdb/db/db_impl/db_impl_secondary.h | 410 +
 src/rocksdb/db/db_impl/db_impl_write.cc | 2435 ++++++
 src/rocksdb/db/db_info_dumper.cc | 147 +
 src/rocksdb/db/db_info_dumper.h | 15 +
 src/rocksdb/db/db_inplace_update_test.cc | 262 +
 src/rocksdb/db/db_io_failure_test.cc | 593 ++
 src/rocksdb/db/db_iter.cc | 1708 ++++
 src/rocksdb/db/db_iter.h | 420 +
 src/rocksdb/db/db_iter_stress_test.cc | 658 ++
 src/rocksdb/db/db_iter_test.cc | 3195 ++++++++
 src/rocksdb/db/db_iterator_test.cc | 3265 ++++++++
 src/rocksdb/db/db_kv_checksum_test.cc | 885 +++
 src/rocksdb/db/db_log_iter_test.cc | 305 +
 src/rocksdb/db/db_logical_block_size_cache_test.cc | 521 ++
 src/rocksdb/db/db_memtable_test.cc | 344 +
 src/rocksdb/db/db_merge_operand_test.cc | 448 ++
 src/rocksdb/db/db_merge_operator_test.cc | 669 ++
 src/rocksdb/db/db_options_test.cc | 1219 +++
 src/rocksdb/db/db_properties_test.cc | 2206 ++++++
 src/rocksdb/db/db_range_del_test.cc | 2807 +++++++
 src/rocksdb/db/db_rate_limiter_test.cc | 451 ++
 src/rocksdb/db/db_readonly_with_timestamp_test.cc | 960 +++
 src/rocksdb/db/db_secondary_test.cc | 1693 ++++
 src/rocksdb/db/db_sst_test.cc | 1868 +++++
 src/rocksdb/db/db_statistics_test.cc | 215 +
 src/rocksdb/db/db_table_properties_test.cc | 625 ++
 src/rocksdb/db/db_tailing_iter_test.cc | 604 ++
 src/rocksdb/db/db_test.cc | 7397 ++++++++++++++++++
 src/rocksdb/db/db_test2.cc | 7652 ++++++++++++++++++
 src/rocksdb/db/db_test_util.cc | 1773 +++++
 src/rocksdb/db/db_test_util.h | 1402 ++++
 src/rocksdb/db/db_universal_compaction_test.cc | 2235 ++++++
 src/rocksdb/db/db_wal_test.cc | 2314 ++++++
 src/rocksdb/db/db_with_timestamp_basic_test.cc | 3880 +++++++++
 .../db/db_with_timestamp_compaction_test.cc | 334 +
 src/rocksdb/db/db_with_timestamp_test_util.cc | 96 +
 src/rocksdb/db/db_with_timestamp_test_util.h | 126 +
 src/rocksdb/db/db_write_buffer_manager_test.cc | 862 ++
 src/rocksdb/db/db_write_test.cc | 679 ++
 src/rocksdb/db/dbformat.cc | 188 +
 src/rocksdb/db/dbformat.h | 865 ++
 src/rocksdb/db/dbformat_test.cc | 214 +
 src/rocksdb/db/deletefile_test.cc | 614 ++
 src/rocksdb/db/error_handler.cc | 819 ++
 src/rocksdb/db/error_handler.h | 124 +
 src/rocksdb/db/error_handler_fs_test.cc | 2875 +++++++
 src/rocksdb/db/event_helpers.cc | 371 +
 src/rocksdb/db/event_helpers.h | 82 +
 src/rocksdb/db/experimental.cc | 155 +
 src/rocksdb/db/external_sst_file_basic_test.cc | 1997 +++++
 src/rocksdb/db/external_sst_file_ingestion_job.cc | 1020 +++
 src/rocksdb/db/external_sst_file_ingestion_job.h | 201 +
 src/rocksdb/db/external_sst_file_test.cc | 2967 +++++++
 src/rocksdb/db/fault_injection_test.cc | 637 ++
 src/rocksdb/db/file_indexer.cc | 218 +
 src/rocksdb/db/file_indexer.h | 140 +
 src/rocksdb/db/file_indexer_test.cc | 352 +
 src/rocksdb/db/filename_test.cc | 241 +
 src/rocksdb/db/flush_job.cc | 1094 +++
 src/rocksdb/db/flush_job.h | 203 +
 src/rocksdb/db/flush_job_test.cc | 745 ++
 src/rocksdb/db/flush_scheduler.cc | 86 +
 src/rocksdb/db/flush_scheduler.h | 55 +
 src/rocksdb/db/forward_iterator.cc | 1062 +++
 src/rocksdb/db/forward_iterator.h | 168 +
 src/rocksdb/db/forward_iterator_bench.cc | 378 +
 src/rocksdb/db/history_trimming_iterator.h | 91 +
 src/rocksdb/db/import_column_family_job.cc | 312 +
 src/rocksdb/db/import_column_family_job.h | 82 +
 src/rocksdb/db/import_column_family_test.cc | 644 ++
 src/rocksdb/db/internal_stats.cc | 2002 +++++
 src/rocksdb/db/internal_stats.h | 996 +++
 src/rocksdb/db/job_context.h | 238 +
 src/rocksdb/db/kv_checksum.h | 398 +
 src/rocksdb/db/listener_test.cc | 1595 ++++
 src/rocksdb/db/log_format.h | 51 +
 src/rocksdb/db/log_reader.cc | 854 ++
 src/rocksdb/db/log_reader.h | 225 +
 src/rocksdb/db/log_test.cc | 1062 +++
 src/rocksdb/db/log_writer.cc | 249 +
 src/rocksdb/db/log_writer.h | 128 +
 src/rocksdb/db/logs_with_prep_tracker.cc | 67 +
 src/rocksdb/db/logs_with_prep_tracker.h | 62 +
 src/rocksdb/db/lookup_key.h | 68 +
 src/rocksdb/db/malloc_stats.cc | 55 +
 src/rocksdb/db/malloc_stats.h | 24 +
 src/rocksdb/db/manual_compaction_test.cc | 308 +
 src/rocksdb/db/memtable.cc | 1675 ++++
 src/rocksdb/db/memtable.h | 664 ++
 src/rocksdb/db/memtable_list.cc | 991 +++
 src/rocksdb/db/memtable_list.h | 471 ++
 src/rocksdb/db/memtable_list_test.cc | 1039 +++
 src/rocksdb/db/merge_context.h | 147 +
 src/rocksdb/db/merge_helper.cc | 583 ++
 src/rocksdb/db/merge_helper.h | 216 +
 src/rocksdb/db/merge_helper_test.cc | 298 +
 src/rocksdb/db/merge_operator.cc | 85 +
 src/rocksdb/db/merge_test.cc | 629 ++
 src/rocksdb/db/obsolete_files_test.cc | 328 +
 src/rocksdb/db/options_file_test.cc | 120 +
 src/rocksdb/db/output_validator.cc | 33 +
 src/rocksdb/db/output_validator.h | 48 +
 src/rocksdb/db/perf_context_test.cc | 1010 +++
 src/rocksdb/db/periodic_task_scheduler.cc | 113 +
 src/rocksdb/db/periodic_task_scheduler.h | 110 +
 src/rocksdb/db/periodic_task_scheduler_test.cc | 231 +
 src/rocksdb/db/pinned_iterators_manager.h | 92 +
 src/rocksdb/db/plain_table_db_test.cc | 1357 ++++
 src/rocksdb/db/post_memtable_callback.h | 25 +
 src/rocksdb/db/pre_release_callback.h | 37 +
 src/rocksdb/db/prefix_test.cc | 906 +++
 src/rocksdb/db/range_del_aggregator.cc | 524 ++
 src/rocksdb/db/range_del_aggregator.h | 476 ++
 src/rocksdb/db/range_del_aggregator_bench.cc | 280 +
 src/rocksdb/db/range_del_aggregator_test.cc | 715 ++
 src/rocksdb/db/range_tombstone_fragmenter.cc | 502 ++
 src/rocksdb/db/range_tombstone_fragmenter.h | 357 +
 src/rocksdb/db/range_tombstone_fragmenter_test.cc | 555 ++
 src/rocksdb/db/read_callback.h | 54 +
 src/rocksdb/db/repair.cc | 771 ++
 src/rocksdb/db/repair_test.cc | 442 +
 src/rocksdb/db/seqno_time_test.cc | 996 +++
 src/rocksdb/db/seqno_to_time_mapping.cc | 341 +
 src/rocksdb/db/seqno_to_time_mapping.h | 189 +
 src/rocksdb/db/snapshot_checker.h | 60 +
 src/rocksdb/db/snapshot_impl.cc | 25 +
 src/rocksdb/db/snapshot_impl.h | 239 +
 src/rocksdb/db/table_cache.cc | 753 ++
 src/rocksdb/db/table_cache.h | 275 +
 src/rocksdb/db/table_cache_sync_and_async.h | 135 +
 src/rocksdb/db/table_properties_collector.cc | 74 +
 src/rocksdb/db/table_properties_collector.h | 175 +
 src/rocksdb/db/table_properties_collector_test.cc | 513 ++
 src/rocksdb/db/transaction_log_impl.cc | 298 +
 src/rocksdb/db/transaction_log_impl.h | 130 +
 src/rocksdb/db/trim_history_scheduler.cc | 54 +
 src/rocksdb/db/trim_history_scheduler.h | 46 +
 src/rocksdb/db/version_builder.cc | 1372 ++++
 src/rocksdb/db/version_builder.h | 72 +
 src/rocksdb/db/version_builder_test.cc | 1695 ++++
 src/rocksdb/db/version_edit.cc | 1043 +++
 src/rocksdb/db/version_edit.h | 669 ++
 src/rocksdb/db/version_edit_handler.cc | 1002 +++
 src/rocksdb/db/version_edit_handler.h | 313 +
 src/rocksdb/db/version_edit_test.cc | 730 ++
 src/rocksdb/db/version_set.cc | 6903 ++++++++++++++++
 src/rocksdb/db/version_set.h | 1652 ++++
 src/rocksdb/db/version_set_sync_and_async.h | 151 +
 src/rocksdb/db/version_set_test.cc | 3587 +++++++++
 src/rocksdb/db/version_util.h | 71 +
 src/rocksdb/db/wal_edit.cc | 211 +
 src/rocksdb/db/wal_edit.h | 177 +
 src/rocksdb/db/wal_edit_test.cc | 213 +
 src/rocksdb/db/wal_manager.cc | 529 ++
 src/rocksdb/db/wal_manager.h | 138 +
 src/rocksdb/db/wal_manager_test.cc | 346 +
 src/rocksdb/db/wide/db_wide_basic_test.cc | 654 ++
 src/rocksdb/db/wide/wide_column_serialization.cc | 182 +
 src/rocksdb/db/wide/wide_column_serialization.h | 77 +
 .../db/wide/wide_column_serialization_test.cc | 338 +
 src/rocksdb/db/wide/wide_columns.cc | 22 +
 src/rocksdb/db/write_batch.cc | 3137 ++++++++
 src/rocksdb/db/write_batch_base.cc | 94 +
 src/rocksdb/db/write_batch_internal.h | 401 +
 src/rocksdb/db/write_batch_test.cc | 1114 +++
 src/rocksdb/db/write_callback.h | 27 +
 src/rocksdb/db/write_callback_test.cc | 465 ++
 src/rocksdb/db/write_controller.cc | 121 +
 src/rocksdb/db/write_controller.h | 148 +
 src/rocksdb/db/write_controller_test.cc | 248 +
 src/rocksdb/db/write_thread.cc | 815 ++
 src/rocksdb/db/write_thread.h | 440 ++
 284 files changed, 233845 insertions(+)

 create mode 100644 src/rocksdb/db/arena_wrapped_db_iter.cc
 create mode 100644 src/rocksdb/db/arena_wrapped_db_iter.h
 create mode 100644 src/rocksdb/db/blob/blob_constants.h
 create mode 100644 src/rocksdb/db/blob/blob_contents.cc
 create mode 100644 src/rocksdb/db/blob/blob_contents.h
 create mode 100644 src/rocksdb/db/blob/blob_counting_iterator.h
 create mode 100644 src/rocksdb/db/blob/blob_counting_iterator_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_fetcher.cc
 create mode 100644 src/rocksdb/db/blob/blob_fetcher.h
 create mode 100644 src/rocksdb/db/blob/blob_file_addition.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_addition.h
 create mode 100644 src/rocksdb/db/blob/blob_file_addition_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_builder.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_builder.h
 create mode 100644 src/rocksdb/db/blob/blob_file_builder_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_cache.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_cache.h
 create mode 100644 src/rocksdb/db/blob/blob_file_cache_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_completion_callback.h
 create mode 100644 src/rocksdb/db/blob/blob_file_garbage.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_garbage.h
 create mode 100644 src/rocksdb/db/blob/blob_file_garbage_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_meta.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_meta.h
 create mode 100644 src/rocksdb/db/blob/blob_file_reader.cc
 create mode 100644 src/rocksdb/db/blob/blob_file_reader.h
 create mode 100644 src/rocksdb/db/blob/blob_file_reader_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_garbage_meter.cc
 create mode 100644 src/rocksdb/db/blob/blob_garbage_meter.h
 create mode 100644 src/rocksdb/db/blob/blob_garbage_meter_test.cc
 create mode 100644 src/rocksdb/db/blob/blob_index.h
 create mode 100644 src/rocksdb/db/blob/blob_log_format.cc
 create mode 100644 src/rocksdb/db/blob/blob_log_format.h
 create mode 100644 src/rocksdb/db/blob/blob_log_sequential_reader.cc
 create mode 100644 src/rocksdb/db/blob/blob_log_sequential_reader.h
 create mode 100644 src/rocksdb/db/blob/blob_log_writer.cc
 create mode 100644 src/rocksdb/db/blob/blob_log_writer.h
 create mode 100644 src/rocksdb/db/blob/blob_read_request.h
 create mode 100644 src/rocksdb/db/blob/blob_source.cc
 create mode 100644 src/rocksdb/db/blob/blob_source.h
 create mode 100644 src/rocksdb/db/blob/blob_source_test.cc
 create mode 100644 src/rocksdb/db/blob/db_blob_basic_test.cc
 create mode 100644 src/rocksdb/db/blob/db_blob_compaction_test.cc
 create mode 100644 src/rocksdb/db/blob/db_blob_corruption_test.cc
 create mode 100644 src/rocksdb/db/blob/db_blob_index_test.cc
 create mode 100644 src/rocksdb/db/blob/prefetch_buffer_collection.cc
 create mode 100644 src/rocksdb/db/blob/prefetch_buffer_collection.h
 create mode 100644 src/rocksdb/db/builder.cc
 create mode 100644 src/rocksdb/db/builder.h
 create mode 100644 src/rocksdb/db/c.cc
 create mode 100644 src/rocksdb/db/c_test.c
 create mode 100644 src/rocksdb/db/column_family.cc
 create mode 100644 src/rocksdb/db/column_family.h
 create mode 100644 src/rocksdb/db/column_family_test.cc
 create mode 100644 src/rocksdb/db/compact_files_test.cc
 create mode 100644 src/rocksdb/db/compaction/clipping_iterator.h
 create mode 100644 src/rocksdb/db/compaction/clipping_iterator_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction.cc
 create mode 100644 src/rocksdb/db/compaction/compaction.h
 create mode 100644 src/rocksdb/db/compaction/compaction_iteration_stats.h
 create mode 100644 src/rocksdb/db/compaction/compaction_iterator.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_iterator.h
 create mode 100644 src/rocksdb/db/compaction/compaction_iterator_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_job.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_job.h
 create mode 100644 src/rocksdb/db/compaction/compaction_job_stats_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_job_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_outputs.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_outputs.h
 create mode 100644 src/rocksdb/db/compaction/compaction_picker.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_picker.h
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_fifo.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_fifo.h
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_level.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_level.h
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_universal.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_picker_universal.h
 create mode 100644 src/rocksdb/db/compaction/compaction_service_job.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_service_test.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_state.cc
 create mode 100644 src/rocksdb/db/compaction/compaction_state.h
 create mode 100644 src/rocksdb/db/compaction/file_pri.h
 create mode 100644 src/rocksdb/db/compaction/sst_partitioner.cc
 create mode 100644 src/rocksdb/db/compaction/subcompaction_state.cc
 create mode 100644 src/rocksdb/db/compaction/subcompaction_state.h
 create mode 100644 src/rocksdb/db/compaction/tiered_compaction_test.cc
 create mode 100644 src/rocksdb/db/comparator_db_test.cc
 create mode 100644 src/rocksdb/db/convenience.cc
 create mode 100644 src/rocksdb/db/corruption_test.cc
 create mode 100644 src/rocksdb/db/cuckoo_table_db_test.cc
 create mode 100644 src/rocksdb/db/db_basic_test.cc
 create mode 100644 src/rocksdb/db/db_block_cache_test.cc
 create mode 100644 src/rocksdb/db/db_bloom_filter_test.cc
 create mode 100644 src/rocksdb/db/db_compaction_filter_test.cc
 create mode 100644 src/rocksdb/db/db_compaction_test.cc
 create mode 100644 src/rocksdb/db/db_dynamic_level_test.cc
 create mode 100644 src/rocksdb/db/db_encryption_test.cc
 create mode 100644 src/rocksdb/db/db_filesnapshot.cc
 create mode 100644 src/rocksdb/db/db_flush_test.cc
 create mode 100644 src/rocksdb/db/db_impl/compacted_db_impl.cc
 create mode 100644 src/rocksdb/db/db_impl/compacted_db_impl.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_debug.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_experimental.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_files.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_open.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_write.cc
 create mode 100644 src/rocksdb/db/db_info_dumper.cc
 create mode 100644 src/rocksdb/db/db_info_dumper.h
 create mode 100644 src/rocksdb/db/db_inplace_update_test.cc
 create mode 100644 src/rocksdb/db/db_io_failure_test.cc
 create mode 100644 src/rocksdb/db/db_iter.cc
 create mode 100644 src/rocksdb/db/db_iter.h
 create mode 100644 src/rocksdb/db/db_iter_stress_test.cc
 create mode 100644 src/rocksdb/db/db_iter_test.cc
 create mode 100644 src/rocksdb/db/db_iterator_test.cc
 create mode 100644 src/rocksdb/db/db_kv_checksum_test.cc
 create mode 100644 src/rocksdb/db/db_log_iter_test.cc
 create mode 100644 src/rocksdb/db/db_logical_block_size_cache_test.cc
 create mode 100644 src/rocksdb/db/db_memtable_test.cc
 create mode 100644 src/rocksdb/db/db_merge_operand_test.cc
 create mode 100644 src/rocksdb/db/db_merge_operator_test.cc
 create mode 100644 src/rocksdb/db/db_options_test.cc
 create mode 100644 src/rocksdb/db/db_properties_test.cc
 create mode 100644 src/rocksdb/db/db_range_del_test.cc
 create mode 100644 src/rocksdb/db/db_rate_limiter_test.cc
 create mode 100644 src/rocksdb/db/db_readonly_with_timestamp_test.cc
 create mode 100644 src/rocksdb/db/db_secondary_test.cc
 create mode 100644 src/rocksdb/db/db_sst_test.cc
 create mode 100644 src/rocksdb/db/db_statistics_test.cc
 create mode 100644 src/rocksdb/db/db_table_properties_test.cc
 create mode 100644 src/rocksdb/db/db_tailing_iter_test.cc
 create mode 100644 src/rocksdb/db/db_test.cc
 create mode 100644 src/rocksdb/db/db_test2.cc
 create mode 100644 src/rocksdb/db/db_test_util.cc
 create mode 100644 src/rocksdb/db/db_test_util.h
 create mode 100644 src/rocksdb/db/db_universal_compaction_test.cc
 create mode 100644 src/rocksdb/db/db_wal_test.cc
 create mode 100644 src/rocksdb/db/db_with_timestamp_basic_test.cc
 create mode 100644 src/rocksdb/db/db_with_timestamp_compaction_test.cc
 create mode 100644 src/rocksdb/db/db_with_timestamp_test_util.cc
 create mode 100644 src/rocksdb/db/db_with_timestamp_test_util.h
 create mode 100644 src/rocksdb/db/db_write_buffer_manager_test.cc
 create mode 100644 src/rocksdb/db/db_write_test.cc
 create mode 100644 src/rocksdb/db/dbformat.cc
 create mode 100644 src/rocksdb/db/dbformat.h
 create mode 100644 src/rocksdb/db/dbformat_test.cc
 create mode 100644 src/rocksdb/db/deletefile_test.cc
 create mode 100644 src/rocksdb/db/error_handler.cc
 create mode 100644 src/rocksdb/db/error_handler.h
 create mode 100644 src/rocksdb/db/error_handler_fs_test.cc
 create mode 100644 src/rocksdb/db/event_helpers.cc
 create mode 100644 src/rocksdb/db/event_helpers.h
 create mode 100644 src/rocksdb/db/experimental.cc
 create mode 100644 src/rocksdb/db/external_sst_file_basic_test.cc
 create mode 100644 src/rocksdb/db/external_sst_file_ingestion_job.cc
 create mode 100644 src/rocksdb/db/external_sst_file_ingestion_job.h
 create mode 100644 src/rocksdb/db/external_sst_file_test.cc
 create mode 100644 src/rocksdb/db/fault_injection_test.cc
 create mode 100644 src/rocksdb/db/file_indexer.cc
 create mode 100644 src/rocksdb/db/file_indexer.h
 create mode 100644 src/rocksdb/db/file_indexer_test.cc
 create mode 100644 src/rocksdb/db/filename_test.cc
 create mode 100644 src/rocksdb/db/flush_job.cc
 create mode 100644 src/rocksdb/db/flush_job.h
 create mode 100644 src/rocksdb/db/flush_job_test.cc
 create mode 100644 src/rocksdb/db/flush_scheduler.cc
 create mode 100644 src/rocksdb/db/flush_scheduler.h
 create mode 100644 src/rocksdb/db/forward_iterator.cc
 create mode 100644 src/rocksdb/db/forward_iterator.h
 create mode 100644 src/rocksdb/db/forward_iterator_bench.cc
 create mode 100644 src/rocksdb/db/history_trimming_iterator.h
 create mode 100644 src/rocksdb/db/import_column_family_job.cc
 create mode 100644 src/rocksdb/db/import_column_family_job.h
 create mode 100644 src/rocksdb/db/import_column_family_test.cc
 create mode 100644 src/rocksdb/db/internal_stats.cc
 create mode 100644 src/rocksdb/db/internal_stats.h
 create mode 100644 src/rocksdb/db/job_context.h
 create mode 100644 src/rocksdb/db/kv_checksum.h
 create mode 100644 src/rocksdb/db/listener_test.cc
 create mode 100644 src/rocksdb/db/log_format.h
 create mode 100644 src/rocksdb/db/log_reader.cc
 create mode 100644 src/rocksdb/db/log_reader.h
 create mode 100644 src/rocksdb/db/log_test.cc
 create mode 100644 src/rocksdb/db/log_writer.cc
 create mode 100644 src/rocksdb/db/log_writer.h
 create mode 100644 src/rocksdb/db/logs_with_prep_tracker.cc
 create mode 100644 src/rocksdb/db/logs_with_prep_tracker.h
 create mode 100644 src/rocksdb/db/lookup_key.h
 create mode 100644 src/rocksdb/db/malloc_stats.cc
 create mode 100644 src/rocksdb/db/malloc_stats.h
 create mode 100644 src/rocksdb/db/manual_compaction_test.cc
 create mode 100644 src/rocksdb/db/memtable.cc
 create mode 100644 src/rocksdb/db/memtable.h
 create mode 100644 src/rocksdb/db/memtable_list.cc
 create mode 100644 src/rocksdb/db/memtable_list.h
 create mode 100644 src/rocksdb/db/memtable_list_test.cc
 create mode 100644 src/rocksdb/db/merge_context.h
 create mode 100644 src/rocksdb/db/merge_helper.cc
 create mode 100644 src/rocksdb/db/merge_helper.h
 create mode 100644 src/rocksdb/db/merge_helper_test.cc
 create mode 100644 src/rocksdb/db/merge_operator.cc
 create mode 100644 src/rocksdb/db/merge_test.cc
 create mode 100644 src/rocksdb/db/obsolete_files_test.cc
 create mode 100644 src/rocksdb/db/options_file_test.cc
 create mode 100644 src/rocksdb/db/output_validator.cc
 create mode 100644 src/rocksdb/db/output_validator.h
 create mode 100644 src/rocksdb/db/perf_context_test.cc
 create mode 100644 src/rocksdb/db/periodic_task_scheduler.cc
 create mode 100644 src/rocksdb/db/periodic_task_scheduler.h
 create mode 100644 src/rocksdb/db/periodic_task_scheduler_test.cc
 create mode 100644 src/rocksdb/db/pinned_iterators_manager.h
 create mode 100644 src/rocksdb/db/plain_table_db_test.cc
 create mode 100644 src/rocksdb/db/post_memtable_callback.h
 create mode 100644 src/rocksdb/db/pre_release_callback.h
 create mode 100644 src/rocksdb/db/prefix_test.cc
 create mode 100644 src/rocksdb/db/range_del_aggregator.cc
 create mode 100644 src/rocksdb/db/range_del_aggregator.h
 create mode 100644 src/rocksdb/db/range_del_aggregator_bench.cc
 create mode 100644 src/rocksdb/db/range_del_aggregator_test.cc
 create mode 100644 src/rocksdb/db/range_tombstone_fragmenter.cc
 create mode 100644 src/rocksdb/db/range_tombstone_fragmenter.h
 create mode 100644 src/rocksdb/db/range_tombstone_fragmenter_test.cc
 create mode 100644 src/rocksdb/db/read_callback.h
 create mode 100644 src/rocksdb/db/repair.cc
 create mode 100644 src/rocksdb/db/repair_test.cc
 create mode 100644 src/rocksdb/db/seqno_time_test.cc
 create mode 100644 src/rocksdb/db/seqno_to_time_mapping.cc
 create mode 100644 src/rocksdb/db/seqno_to_time_mapping.h
 create mode 100644 src/rocksdb/db/snapshot_checker.h
 create mode 100644 src/rocksdb/db/snapshot_impl.cc
 create mode 100644 src/rocksdb/db/snapshot_impl.h
 create mode 100644 src/rocksdb/db/table_cache.cc
 create mode 100644 src/rocksdb/db/table_cache.h
 create mode 100644 src/rocksdb/db/table_cache_sync_and_async.h
 create mode 100644 src/rocksdb/db/table_properties_collector.cc
 create mode 100644 src/rocksdb/db/table_properties_collector.h
 create mode 100644 src/rocksdb/db/table_properties_collector_test.cc
 create mode 100644 src/rocksdb/db/transaction_log_impl.cc
 create mode 100644 src/rocksdb/db/transaction_log_impl.h
 create mode 100644 src/rocksdb/db/trim_history_scheduler.cc
 create mode 100644 src/rocksdb/db/trim_history_scheduler.h
 create mode 100644 src/rocksdb/db/version_builder.cc
 create mode 100644 src/rocksdb/db/version_builder.h
 create mode 100644 src/rocksdb/db/version_builder_test.cc
 create mode 100644 src/rocksdb/db/version_edit.cc
 create mode 100644 src/rocksdb/db/version_edit.h
 create mode 100644 src/rocksdb/db/version_edit_handler.cc
 create mode 100644 src/rocksdb/db/version_edit_handler.h
 create mode 100644 src/rocksdb/db/version_edit_test.cc
 create mode 100644 src/rocksdb/db/version_set.cc
 create mode 100644 src/rocksdb/db/version_set.h
 create mode 100644 src/rocksdb/db/version_set_sync_and_async.h
 create mode 100644 src/rocksdb/db/version_set_test.cc
 create mode 100644 src/rocksdb/db/version_util.h
 create mode 100644 src/rocksdb/db/wal_edit.cc
 create mode 100644 src/rocksdb/db/wal_edit.h
 create mode 100644 src/rocksdb/db/wal_edit_test.cc
 create mode 100644 src/rocksdb/db/wal_manager.cc
 create mode 100644 src/rocksdb/db/wal_manager.h
 create mode 100644 src/rocksdb/db/wal_manager_test.cc
 create mode 100644 src/rocksdb/db/wide/db_wide_basic_test.cc
 create mode 100644 src/rocksdb/db/wide/wide_column_serialization.cc
 create mode 100644 src/rocksdb/db/wide/wide_column_serialization.h
 create mode 100644 src/rocksdb/db/wide/wide_column_serialization_test.cc
 create mode 100644 src/rocksdb/db/wide/wide_columns.cc
 create mode 100644 src/rocksdb/db/write_batch.cc
 create mode 100644 src/rocksdb/db/write_batch_base.cc
 create mode 100644 src/rocksdb/db/write_batch_internal.h
 create mode 100644 src/rocksdb/db/write_batch_test.cc
 create mode 100644 src/rocksdb/db/write_callback.h
 create mode 100644 src/rocksdb/db/write_callback_test.cc
 create mode 100644 src/rocksdb/db/write_controller.cc
 create mode 100644 src/rocksdb/db/write_controller.h
 create mode 100644 src/rocksdb/db/write_controller_test.cc
 create mode 100644 src/rocksdb/db/write_thread.cc
 create mode 100644 src/rocksdb/db/write_thread.h

diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..607403ccc
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+                                       std::string* prop) {
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+      *prop = std::to_string(sv_number_);
+    }
+    return Status::OK();
+  }
+  return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
+    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+  auto mem = arena_.AllocateAligned(sizeof(DBIter));
+  db_iter_ =
+      new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
+                       ioptions.user_comparator, /* iter */ nullptr, version,
+                       sequence, true, max_sequential_skip_in_iteration,
+                       read_callback, db_impl, cfd, expose_blob_index);
+  sv_number_ = version_number;
+  read_options_ = read_options;
+  allow_refresh_ = allow_refresh;
+  memtable_range_tombstone_iter_ = nullptr;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+  if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+    return Status::NotSupported("Creating renew iterator is not allowed.");
+  }
+  assert(db_iter_ != nullptr);
+  // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+  // correct behavior. Will be corrected automatically when we take a snapshot
+  // here for the case of WritePreparedTxnDB.
+  uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
+  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
+  auto reinit_internal_iter = [&]() {
+    Env* env = db_iter_->env();
+    db_iter_->~DBIter();
+    arena_.~Arena();
+    new (&arena_) Arena();
+
+    SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+    SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+    if (read_callback_) {
+      read_callback_->Refresh(latest_seq);
+    }
+    Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+         sv->current, latest_seq,
+         sv->mutable_cf_options.max_sequential_skip_in_iterations,
+         cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
+         allow_refresh_);
+
+    InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+        read_options_, cfd_, sv, &arena_, latest_seq,
+        /* allow_unprepared_value */ true, /* db_iter */ this);
+    SetIterUnderDBIter(internal_iter);
+  };
+  while (true) {
+    if (sv_number_ != cur_sv_number) {
+      reinit_internal_iter();
+      break;
+    } else {
+      SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+      // Refresh range-tombstones in MemTable
+      if (!read_options_.ignore_range_deletions) {
+        SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+        TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
+        auto t = sv->mem->NewRangeTombstoneIterator(
+            read_options_, latest_seq, false /* immutable_memtable */);
+        if (!t || t->empty()) {
+          // If memtable_range_tombstone_iter_ points to a non-empty tombstone
+          // iterator, then it means sv->mem is not the memtable that
+          // memtable_range_tombstone_iter_ points to, so SV must have changed
+          // after the sv_number_ != cur_sv_number check above. We will fall
+          // back to re-init the InternalIterator, and the tombstone iterator
+          // will be freed during db_iter destruction there.
+          if (memtable_range_tombstone_iter_) {
+            assert(!*memtable_range_tombstone_iter_ ||
+                   sv_number_ != cfd_->GetSuperVersionNumber());
+          }
+          delete t;
+        } else {  // current mutable memtable has range tombstones
+          if (!memtable_range_tombstone_iter_) {
+            delete t;
+            db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+            // The memtable under DBIter did not have range tombstone before
+            // refresh.
+            reinit_internal_iter();
+            break;
+          } else {
+            delete *memtable_range_tombstone_iter_;
+            *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
+                std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
+                &cfd_->internal_comparator(), nullptr, nullptr);
+          }
+        }
+        db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+      }
+      // Refresh latest sequence number
+      db_iter_->set_sequence(latest_seq);
+      db_iter_->set_valid(false);
+      // Check again if the latest super version number is changed
+      uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+      if (latest_sv_number != cur_sv_number) {
+        // If the super version number is changed after refreshing,
+        // fallback to Re-Init the InternalIterator
+        cur_sv_number = latest_sv_number;
+        continue;
+      }
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+  iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
+             max_sequential_skip_in_iterations, version_number, read_callback,
+             db_impl, cfd, expose_blob_index, allow_refresh);
+  if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+    iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index);
+  }
+
+  return iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
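A minimal caller-side sketch (not part of the patch) of the Refresh()/GetProperty() behavior implemented above, using only public RocksDB APIs (DB::Open, DB::NewIterator, Iterator::Refresh, Iterator::GetProperty); the database path and options are illustrative assumptions:

    #include <cassert>
    #include <string>

    #include "rocksdb/db.h"

    // Sketch: exercise iterator refresh and the super-version property.
    void RefreshSketch() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/refresh_sketch_db", &db);
      assert(s.ok());

      rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
      // ArenaWrappedDBIter::GetProperty falls back to its own sv_number_ when
      // the inner DBIter does not report this property itself.
      std::string sv_number;
      s = it->GetProperty("rocksdb.iterator.super-version-number", &sv_number);
      assert(s.ok());

      // Refresh() re-seats the iterator on the latest super version (or just
      // bumps the sequence number when nothing changed), avoiding a full
      // delete-and-recreate cycle.
      s = it->Refresh();
      assert(s.ok());

      delete it;
      delete db;
    }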
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..f15be306d
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Version;
+
+// A wrapper iterator which wraps the DB iterator and the arena from which the
+// DB iterator is supposed to be allocated. This class is used as an entry
+// point of an iterator hierarchy whose memory can be allocated inline. In that
+// way, accessing the iterator tree can be more cache friendly. It is also
+// faster to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+  ~ArenaWrappedDBIter() override {
+    if (db_iter_ != nullptr) {
+      db_iter_->~DBIter();
+    } else {
+      assert(false);
+    }
+  }
+
+  // Get the arena to be used to allocate memory for DBIter to be wrapped,
+  // as well as child iterators in it.
+  virtual Arena* GetArena() { return &arena_; }
+
+  const ReadOptions& GetReadOptions() { return read_options_; }
+
+  // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+  // a merging iterator.
+  virtual void SetIterUnderDBIter(InternalIterator* iter) {
+    db_iter_->SetIter(iter);
+  }
+
+  void SetMemtableRangetombstoneIter(TruncatedRangeDelIterator** iter) {
+    memtable_range_tombstone_iter_ = iter;
+  }
+
+  bool Valid() const override { return db_iter_->Valid(); }
+  void SeekToFirst() override { db_iter_->SeekToFirst(); }
+  void SeekToLast() override { db_iter_->SeekToLast(); }
+  // 'target' does not contain timestamp, even if user timestamp feature is
+  // enabled.
+  void Seek(const Slice& target) override { db_iter_->Seek(target); }
+  void SeekForPrev(const Slice& target) override {
+    db_iter_->SeekForPrev(target);
+  }
+  void Next() override { db_iter_->Next(); }
+  void Prev() override { db_iter_->Prev(); }
+  Slice key() const override { return db_iter_->key(); }
+  Slice value() const override { return db_iter_->value(); }
+  const WideColumns& columns() const override { return db_iter_->columns(); }
+  Status status() const override { return db_iter_->status(); }
+  Slice timestamp() const override { return db_iter_->timestamp(); }
+  bool IsBlob() const { return db_iter_->IsBlob(); }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override;
+
+  Status Refresh() override;
+
+  void Init(Env* env, const ReadOptions& read_options,
+            const ImmutableOptions& ioptions,
+            const MutableCFOptions& mutable_cf_options, const Version* version,
+            const SequenceNumber& sequence,
+            uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+            ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+            bool expose_blob_index, bool allow_refresh);
+
+  // Store some parameters so we can refresh the iterator at a later point
+  // with these same params
+  void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd,
+                        ReadCallback* read_callback, bool expose_blob_index) {
+    db_impl_ = db_impl;
+    cfd_ = cfd;
+    read_callback_ = read_callback;
+    expose_blob_index_ = expose_blob_index;
+  }
+
+ private:
+  DBIter* db_iter_ = nullptr;
+  Arena arena_;
+  uint64_t sv_number_;
+  ColumnFamilyData* cfd_ = nullptr;
+  DBImpl* db_impl_ = nullptr;
+  ReadOptions read_options_;
+  ReadCallback* read_callback_;
+  bool expose_blob_index_ = false;
+  bool allow_refresh_ = true;
+  // If this is nullptr, it means the mutable memtable does not contain range
+  // tombstone when added under this DBIter.
+  TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
+};
+
+// Generate the arena wrapped iterator class.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    uint64_t version_number, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false, bool allow_refresh = true);
+}  // namespace ROCKSDB_NAMESPACE
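A sketch (not part of the patch) of the wiring pattern this factory supports, mirroring the reinit logic in Refresh() in the .cc above; the inputs (env, options, superversion, and so on) are placeholders rather than a real call site:

    // The DB iterator and its children share one arena: allocate the wrapper,
    // then build the internal (usually merging) iterator from the same arena.
    ArenaWrappedDBIter* iter = NewArenaWrappedDbIterator(
        env, read_options, ioptions, mutable_cf_options, version, sequence,
        max_sequential_skip_in_iterations, version_number, read_callback,
        db_impl, cfd, /*expose_blob_index=*/false, /*allow_refresh=*/true);
    InternalIterator* internal_iter = db_impl->NewInternalIterator(
        read_options, cfd, superversion, iter->GetArena(), sequence,
        /*allow_unprepared_value=*/true, /*db_iter=*/iter);
    iter->SetIterUnderDBIter(internal_iter);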
Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h new file mode 100644 index 000000000..9b7c5b969 --- /dev/null +++ b/src/rocksdb/db/blob/blob_contents.h @@ -0,0 +1,56 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "memory/memory_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A class representing a single uncompressed value read from a blob file. +class BlobContents { + public: + static std::unique_ptr Create(CacheAllocationPtr&& allocation, + size_t size); + + BlobContents(const BlobContents&) = delete; + BlobContents& operator=(const BlobContents&) = delete; + + BlobContents(BlobContents&&) = default; + BlobContents& operator=(BlobContents&&) = default; + + ~BlobContents() = default; + + const Slice& data() const { return data_; } + size_t size() const { return data_.size(); } + + size_t ApproximateMemoryUsage() const; + + // Callbacks for secondary cache + static size_t SizeCallback(void* obj); + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out); + + static Cache::CacheItemHelper* GetCacheItemHelper(); + + static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf, + size_t size, void** out_obj, size_t* charge); + + private: + BlobContents(CacheAllocationPtr&& allocation, size_t size) + : allocation_(std::move(allocation)), data_(allocation_.get(), size) {} + + CacheAllocationPtr allocation_; + Slice data_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h new file mode 100644 index 000000000..de549afa2 --- /dev/null +++ b/src/rocksdb/db/blob/blob_counting_iterator.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/blob/blob_garbage_meter.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that passes each key-value encountered to +// BlobGarbageMeter as inflow in order to measure the total number and size of +// blobs in the compaction input on a per-blob file basis. 
diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h
new file mode 100644
index 000000000..de549afa2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator {
+ public:
+  BlobCountingIterator(InternalIterator* iter,
+                       BlobGarbageMeter* blob_garbage_meter)
+      : iter_(iter), blob_garbage_meter_(blob_garbage_meter) {
+    assert(iter_);
+    assert(blob_garbage_meter_);
+
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  bool Valid() const override { return iter_->Valid() && status_.ok(); }
+
+  void SeekToFirst() override {
+    iter_->SeekToFirst();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void SeekToLast() override {
+    iter_->SeekToLast();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void Seek(const Slice& target) override {
+    iter_->Seek(target);
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    iter_->SeekForPrev(target);
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void Next() override {
+    assert(Valid());
+
+    iter_->Next();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(Valid());
+
+    const bool res = iter_->NextAndGetResult(result);
+    UpdateAndCountBlobIfNeeded();
+    return res;
+  }
+
+  void Prev() override {
+    assert(Valid());
+
+    iter_->Prev();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return iter_->key();
+  }
+
+  Slice user_key() const override {
+    assert(Valid());
+    return iter_->user_key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return iter_->value();
+  }
+
+  Status status() const override { return status_; }
+
+  bool PrepareValue() override {
+    assert(Valid());
+    return iter_->PrepareValue();
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return iter_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+    return iter_->UpperBoundCheckResult();
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+
+  bool IsKeyPinned() const override {
+    assert(Valid());
+    return iter_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(Valid());
+    return iter_->IsValuePinned();
+  }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override {
+    return iter_->GetProperty(prop_name, prop);
+  }
+
+ private:
+  void UpdateAndCountBlobIfNeeded() {
+    assert(!iter_->Valid() || iter_->status().ok());
+
+    if (!iter_->Valid()) {
+      status_ = iter_->status();
+      return;
+    }
+
+    TEST_SYNC_POINT(
+        "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow");
+
+    status_ = blob_garbage_meter_->ProcessInFlow(key(), value());
+  }
+
+  InternalIterator* iter_;
+  BlobGarbageMeter* blob_garbage_meter_;
+  Status status_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
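A sketch (not part of the patch) of how the counting wrapper is driven; `input` stands for any InternalIterator over compaction input (the test below uses a VectorIterator):

    BlobGarbageMeter meter;
    BlobCountingIterator counting_iter(&input, &meter);
    for (counting_iter.SeekToFirst(); counting_iter.Valid();
         counting_iter.Next()) {
      // Every (re)positioning above feeds key()/value() to the meter as
      // inflow, accumulating per-blob-file record counts and byte totals.
    }
    // meter.flows() now maps blob file numbers to their accumulated inflow.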
diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
new file mode 100644
index 000000000..c7bbc8f58
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
@@ -0,0 +1,327 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_counting_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter,
+                 uint64_t blob_file_number, uint64_t count, uint64_t bytes) {
+  const auto& flows = blob_garbage_meter.flows();
+
+  const auto it = flows.find(blob_file_number);
+  if (it == flows.end()) {
+    ASSERT_EQ(count, 0);
+    ASSERT_EQ(bytes, 0);
+    return;
+  }
+
+  const auto& in = it->second.GetInFlow();
+
+  ASSERT_EQ(in.GetCount(), count);
+  ASSERT_EQ(in.GetBytes(), bytes);
+}
+
+TEST(BlobCountingIteratorTest, CountBlobs) {
+  // Note: the input consists of three key-values: two are blob references to
+  // different blob files, while the third one is a plain value.
+  constexpr char user_key0[] = "key0";
+  constexpr char user_key1[] = "key1";
+  constexpr char user_key2[] = "key2";
+
+  const std::vector<std::string> keys{
+      test::KeyStr(user_key0, 1, kTypeBlobIndex),
+      test::KeyStr(user_key1, 2, kTypeBlobIndex),
+      test::KeyStr(user_key2, 3, kTypeValue)};
+
+  constexpr uint64_t first_blob_file_number = 4;
+  constexpr uint64_t first_offset = 1000;
+  constexpr uint64_t first_size = 2000;
+
+  std::string first_blob_index;
+  BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset,
+                        first_size, kNoCompression);
+
+  constexpr uint64_t second_blob_file_number = 6;
+  constexpr uint64_t second_offset = 2000;
+  constexpr uint64_t second_size = 4000;
+
+  std::string second_blob_index;
+  BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number,
+                        second_offset, second_size, kNoCompression);
+
+  const std::vector<std::string> values{first_blob_index, second_blob_index,
+                                        "raw_value"};
+
+  assert(keys.size() == values.size());
+
+  VectorIterator input(keys, values);
+  BlobGarbageMeter blob_garbage_meter;
+
+  BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+  constexpr uint64_t first_expected_bytes =
+      first_size +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1);
+  constexpr uint64_t second_expected_bytes =
+      second_size +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1);
+
+  // Call SeekToFirst and iterate forward
+  blob_counter.SeekToFirst();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0);
+
+  blob_counter.Next();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  blob_counter.Next();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  blob_counter.Next();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  // Do it again using NextAndGetResult
+  blob_counter.SeekToFirst();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  {
+    IterateResult result;
+    ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[1]);
+    ASSERT_EQ(blob_counter.user_key(), user_key1);
+    ASSERT_TRUE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    ASSERT_EQ(blob_counter.key(), keys[1]);
+    ASSERT_EQ(blob_counter.value(), values[1]);
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  {
+    IterateResult result;
+    ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[2]);
+    ASSERT_EQ(blob_counter.user_key(), user_key2);
+    ASSERT_TRUE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    ASSERT_EQ(blob_counter.key(), keys[2]);
+    ASSERT_EQ(blob_counter.value(), values[2]);
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  {
+    IterateResult result;
+    ASSERT_FALSE(blob_counter.NextAndGetResult(&result));
+    ASSERT_FALSE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  // Call SeekToLast and iterate backward
+  blob_counter.SeekToLast();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+              2 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+              3 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+              3 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  // Call Seek for all keys (plus one that's greater than all of them)
+  blob_counter.Seek(keys[0]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Seek(keys[1]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.Seek(keys[2]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.Seek("zzz");
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  // Call SeekForPrev for all keys (plus one that's less than all of them)
+  blob_counter.SeekForPrev("aaa");
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[0]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[1]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+              5 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[2]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+              5 * second_expected_bytes);
+}
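+
+// A sketch of the byte accounting exercised above (assuming, as the helper's
+// name suggests, that CalculateAdjustmentForRecordHeader() charges the key
+// size plus the fixed per-record header size): each visit to a blob reference
+// adds blob_size + key_size + BlobLogRecord::kHeaderSize to the corresponding
+// blob file's in-flow, so N visits yield N times that amount.
+//
+//   uint64_t ExpectedInFlowBytes(uint64_t blob_size, uint64_t key_size) {
+//     return blob_size +
+//            BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size);
+//   }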
+
+TEST(BlobCountingIteratorTest, CorruptBlobIndex) {
+  const std::vector<std::string> keys{
+      test::KeyStr("user_key", 1, kTypeBlobIndex)};
+  const std::vector<std::string> values{"i_am_not_a_blob_index"};
+
+  assert(keys.size() == values.size());
+
+  VectorIterator input(keys, values);
+  BlobGarbageMeter blob_garbage_meter;
+
+  BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+  blob_counter.SeekToFirst();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_NOK(blob_counter.status());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc
new file mode 100644
index 000000000..124429f93
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_fetcher.h"
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+                              const Slice& blob_index_slice,
+                              FilePrefetchBuffer* prefetch_buffer,
+                              PinnableSlice* blob_value,
+                              uint64_t* bytes_read) const {
+  assert(version_);
+
+  return version_->GetBlob(read_options_, user_key, blob_index_slice,
+                           prefetch_buffer, blob_value, bytes_read);
+}
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+                              const BlobIndex& blob_index,
+                              FilePrefetchBuffer* prefetch_buffer,
+                              PinnableSlice* blob_value,
+                              uint64_t* bytes_read) const {
+  assert(version_);
+
+  return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
+                           blob_value, bytes_read);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h
new file mode 100644
index 000000000..8aeaf965d
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class Slice;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class BlobIndex;
+
+// A thin wrapper around the blob retrieval functionality of Version.
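+//
+// Illustrative usage (a sketch; the surrounding objects are assumed to be
+// supplied by the caller, and the fetcher owns neither of them):
+//
+//   BlobFetcher fetcher(version, read_options);
+//   PinnableSlice blob_value;
+//   uint64_t bytes_read = 0;
+//   const Status s = fetcher.FetchBlob(user_key, blob_index_slice,
+//                                      nullptr /* prefetch_buffer */,
+//                                      &blob_value, &bytes_read);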
+class BlobFetcher {
+ public:
+  BlobFetcher(const Version* version, const ReadOptions& read_options)
+      : version_(version), read_options_(read_options) {}
+
+  Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
+                   FilePrefetchBuffer* prefetch_buffer,
+                   PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+  Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
+                   FilePrefetchBuffer* prefetch_buffer,
+                   PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ private:
+  const Version* version_;
+  ReadOptions read_options_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc
new file mode 100644
index 000000000..71b1bb7fc
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileAddition::CustomFieldTags : uint32_t {
+  kEndMarker,
+
+  // Add forward compatible fields here
+
+  /////////////////////////////////////////////////////////////////////
+
+  kForwardIncompatibleMask = 1 << 6,
+
+  // Add forward incompatible fields here
+};
+
+void BlobFileAddition::EncodeTo(std::string* output) const {
+  PutVarint64(output, blob_file_number_);
+  PutVarint64(output, total_blob_count_);
+  PutVarint64(output, total_blob_bytes_);
+  PutLengthPrefixedSlice(output, checksum_method_);
+  PutLengthPrefixedSlice(output, checksum_value_);
+
+  // Encode any custom fields here. The format to use is a Varint32 tag (see
+  // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+  // fields will be ignored during decoding unless they're in the forward
+  // incompatible range.
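+  //
+  // For illustration (not an actual field): a forward compatible field with
+  // tag 1 (below kForwardIncompatibleMask, so older readers can skip it)
+  // would be written as
+  //
+  //   PutVarint32(output, 1 /* custom field tag */);
+  //   PutLengthPrefixedSlice(output, custom_field_value);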
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output);
+
+  PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileAddition::DecodeFrom(Slice* input) {
+  constexpr char class_name[] = "BlobFileAddition";
+
+  if (!GetVarint64(input, &blob_file_number_)) {
+    return Status::Corruption(class_name, "Error decoding blob file number");
+  }
+
+  if (!GetVarint64(input, &total_blob_count_)) {
+    return Status::Corruption(class_name, "Error decoding total blob count");
+  }
+
+  if (!GetVarint64(input, &total_blob_bytes_)) {
+    return Status::Corruption(class_name, "Error decoding total blob bytes");
+  }
+
+  Slice checksum_method;
+  if (!GetLengthPrefixedSlice(input, &checksum_method)) {
+    return Status::Corruption(class_name, "Error decoding checksum method");
+  }
+  checksum_method_ = checksum_method.ToString();
+
+  Slice checksum_value;
+  if (!GetLengthPrefixedSlice(input, &checksum_value)) {
+    return Status::Corruption(class_name, "Error decoding checksum value");
+  }
+  checksum_value_ = checksum_value.ToString();
+
+  while (true) {
+    uint32_t custom_field_tag = 0;
+    if (!GetVarint32(input, &custom_field_tag)) {
+      return Status::Corruption(class_name, "Error decoding custom field tag");
+    }
+
+    if (custom_field_tag == kEndMarker) {
+      break;
+    }
+
+    if (custom_field_tag & kForwardIncompatibleMask) {
+      return Status::Corruption(
+          class_name, "Forward incompatible custom field encountered");
+    }
+
+    Slice custom_field_value;
+    if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+      return Status::Corruption(class_name,
+                                "Error decoding custom field value");
+    }
+  }
+
+  return Status::OK();
+}
+
+std::string BlobFileAddition::DebugString() const {
+  std::ostringstream oss;
+
+  oss << *this;
+
+  return oss.str();
+}
+
+std::string BlobFileAddition::DebugJSON() const {
+  JSONWriter jw;
+
+  jw << *this;
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+  return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+         lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() &&
+         lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() &&
+         lhs.GetChecksumMethod() == rhs.GetChecksumMethod() &&
+         lhs.GetChecksumValue() == rhs.GetChecksumValue();
+}
+
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+  return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileAddition& blob_file_addition) {
+  os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber()
+     << " total_blob_count: " << blob_file_addition.GetTotalBlobCount()
+     << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes()
+     << " checksum_method: " << blob_file_addition.GetChecksumMethod()
+     << " checksum_value: "
+     << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+  return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileAddition& blob_file_addition) {
+  jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber()
+     << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount()
+     << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes()
+     << "ChecksumMethod" << blob_file_addition.GetChecksumMethod()
+     << "ChecksumValue"
+     << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+  return jw;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h
new file mode 100644
index 000000000..43b1a0bcb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileAddition {
+ public:
+  BlobFileAddition() = default;
+
+  BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count,
+                   uint64_t total_blob_bytes, std::string checksum_method,
+                   std::string checksum_value)
+      : blob_file_number_(blob_file_number),
+        total_blob_count_(total_blob_count),
+        total_blob_bytes_(total_blob_bytes),
+        checksum_method_(std::move(checksum_method)),
+        checksum_value_(std::move(checksum_value)) {
+    assert(checksum_method_.empty() == checksum_value_.empty());
+  }
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+  uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+  const std::string& GetChecksumMethod() const { return checksum_method_; }
+  const std::string& GetChecksumValue() const { return checksum_value_; }
+
+  void EncodeTo(std::string* output) const;
+  Status DecodeFrom(Slice* input);
+
+  std::string DebugString() const;
+  std::string DebugJSON() const;
+
+ private:
+  enum CustomFieldTags : uint32_t;
+
+  uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+  uint64_t total_blob_count_ = 0;
+  uint64_t total_blob_bytes_ = 0;
+  std::string checksum_method_;
+  std::string checksum_value_;
+};
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileAddition& blob_file_addition);
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileAddition& blob_file_addition);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc
new file mode 100644
index 000000000..64cb0a9d6
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition_test.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "db/blob/blob_file_addition.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileAdditionTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) { + std::string encoded; + blob_file_addition.EncodeTo(&encoded); + + BlobFileAddition decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_addition, decoded); + } +}; + +TEST_F(BlobFileAdditionTest, Empty) { + BlobFileAddition blob_file_addition; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0); + ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty()); + ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty()); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 2; + constexpr uint64_t total_blob_bytes = 123456; + const std::string checksum_method("SHA1"); + const std::string checksum_value( + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileAddition blob_file_addition; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob count")); + } + + constexpr uint64_t total_blob_count = 4567; + PutVarint64(&str, total_blob_count); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes")); + } + + constexpr uint64_t total_blob_bytes = 12345678; + PutVarint64(&str, total_blob_bytes); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum method")); + } + + constexpr char checksum_method[] = "SHA1"; + PutLengthPrefixedSlice(&str, checksum_method); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum value")); + } + + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + PutLengthPrefixedSlice(&str, checksum_value); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + 
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+  }
+
+  constexpr uint32_t custom_tag = 2;
+  PutVarint32(&str, custom_tag);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+  }
+}
+
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_compatible_tag = 2;
+        PutVarint32(output, forward_compatible_tag);
+
+        PutLengthPrefixedSlice(output, "deadbeef");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 678;
+  constexpr uint64_t total_blob_count = 9999;
+  constexpr uint64_t total_blob_bytes = 100000000;
+  const std::string checksum_method("CRC32");
+  const std::string checksum_value("\x3d\x87\xff\x57");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  TestEncodeDecode(blob_file_addition);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+        PutVarint32(output, forward_incompatible_tag);
+
+        PutLengthPrefixedSlice(output, "foobar");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 456;
+  constexpr uint64_t total_blob_count = 100;
+  constexpr uint64_t total_blob_bytes = 2000000;
+  const std::string checksum_method("CRC32B");
+  const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  std::string encoded;
+  blob_file_addition.EncodeTo(&encoded);
+
+  BlobFileAddition decoded_blob_file_addition;
+  Slice input(encoded);
+  const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
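+
+// Overview (summary comment, not upstream documentation): Add() ignores
+// values shorter than min_blob_size, lazily opens a blob file on the first
+// qualifying value, optionally compresses the blob, appends a record, rolls
+// over to a new file once blob_file_size is reached, and can pre-populate
+// the blob cache. Finish() closes any file that is still open.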
+ +#include "db/blob/blob_file_builder.h" + +#include + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/event_helpers.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileBuilder::BlobFileBuilder( + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + std::string db_id, std::string db_session_id, int job_id, + uint32_t column_family_id, const std::string& column_family_name, + Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + db_id, db_session_id, job_id, column_family_id, + column_family_name, io_priority, write_hint, io_tracer, + blob_callback, creation_reason, blob_file_paths, + blob_file_additions) {} + +BlobFileBuilder::BlobFileBuilder( + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + std::string db_id, std::string db_session_id, int job_id, + uint32_t column_family_id, const std::string& column_family_name, + Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : file_number_generator_(std::move(file_number_generator)), + fs_(fs), + immutable_options_(immutable_options), + min_blob_size_(mutable_cf_options->min_blob_size), + blob_file_size_(mutable_cf_options->blob_file_size), + blob_compression_type_(mutable_cf_options->blob_compression_type), + prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache), + file_options_(file_options), + db_id_(std::move(db_id)), + db_session_id_(std::move(db_session_id)), + job_id_(job_id), + column_family_id_(column_family_id), + column_family_name_(column_family_name), + io_priority_(io_priority), + write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), + creation_reason_(creation_reason), + blob_file_paths_(blob_file_paths), + blob_file_additions_(blob_file_additions), + blob_count_(0), + blob_bytes_(0) { + assert(file_number_generator_); + assert(fs_); + assert(immutable_options_); + assert(file_options_); + assert(blob_file_paths_); + assert(blob_file_paths_->empty()); + assert(blob_file_additions_); + assert(blob_file_additions_->empty()); +} + +BlobFileBuilder::~BlobFileBuilder() = default; + +Status BlobFileBuilder::Add(const Slice& key, const Slice& value, + std::string* blob_index) { + assert(blob_index); + assert(blob_index->empty()); + + if 
+  if (value.size() < min_blob_size_) {
+    return Status::OK();
+  }
+
+  {
+    const Status s = OpenBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  Slice blob = value;
+  std::string compressed_blob;
+
+  {
+    const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t blob_file_number = 0;
+  uint64_t blob_offset = 0;
+
+  {
+    const Status s =
+        WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s = CloseBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s =
+        PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_options_->info_log,
+                     "Failed to pre-populate the blob into blob cache: %s",
+                     s.ToString().c_str());
+    }
+  }
+
+  BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+                        blob_compression_type_);
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::Finish() {
+  if (!IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+  if (IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  assert(!blob_count_);
+  assert(!blob_bytes_);
+
+  assert(file_number_generator_);
+  const uint64_t blob_file_number = file_number_generator_();
+
+  assert(immutable_options_);
+  assert(!immutable_options_->cf_paths.empty());
+  std::string blob_file_path =
+      BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+  if (blob_callback_) {
+    blob_callback_->OnBlobFileCreationStarted(
+        blob_file_path, column_family_name_, job_id_, creation_reason_);
+  }
+
+  std::unique_ptr<FSWritableFile> file;
+
+  {
+    assert(file_options_);
+    Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Note: files get added to blob_file_paths_ right after the open, so they
+  // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+  // which only contains successfully written files.
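+  // (Consequently, assuming the usual caller contract: on failure the caller
+  // can delete every path recorded in blob_file_paths_, while entries in
+  // blob_file_additions_ can be applied to the MANIFEST as-is.)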
+  assert(blob_file_paths_);
+  blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+  assert(file);
+  file->SetIOPriority(io_priority_);
+  file->SetWriteLifeTimeHint(write_hint_);
+  FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+  Statistics* const statistics = immutable_options_->stats;
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_paths_->back(), *file_options_,
+      immutable_options_->clock, io_tracer_, statistics,
+      immutable_options_->listeners,
+      immutable_options_->file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kBlobFile), false));
+
+  constexpr bool do_flush = false;
+
+  std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+      std::move(file_writer), immutable_options_->clock, statistics,
+      blob_file_number, immutable_options_->use_fsync, do_flush));
+
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+
+  BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+                       expiration_range);
+
+  {
+    Status s = blob_log_writer->WriteHeader(header);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  writer_ = std::move(blob_log_writer);
+
+  assert(IsBlobFileOpen());
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::CompressBlobIfNeeded(
+    Slice* blob, std::string* compressed_blob) const {
+  assert(blob);
+  assert(compressed_blob);
+  assert(compressed_blob->empty());
+  assert(immutable_options_);
+
+  if (blob_compression_type_ == kNoCompression) {
+    return Status::OK();
+  }
+
+  CompressionOptions opts;
+  CompressionContext context(blob_compression_type_);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       blob_compression_type_, sample_for_compression);
+
+  constexpr uint32_t compression_format_version = 2;
+
+  bool success = false;
+
+  {
+    StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+                         BLOB_DB_COMPRESSION_MICROS);
+    success =
+        CompressData(*blob, info, compression_format_version, compressed_blob);
+  }
+
+  if (!success) {
+    return Status::Corruption("Error compressing blob");
+  }
+
+  *blob = Slice(*compressed_blob);
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+                                        uint64_t* blob_file_number,
+                                        uint64_t* blob_offset) {
+  assert(IsBlobFileOpen());
+  assert(blob_file_number);
+  assert(blob_offset);
+
+  uint64_t key_offset = 0;
+
+  Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  *blob_file_number = writer_->get_log_number();
+
+  ++blob_count_;
+  blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::CloseBlobFile() {
+  assert(IsBlobFileOpen());
+
+  BlobLogFooter footer;
+  footer.blob_count = blob_count_;
+
+  std::string checksum_method;
+  std::string checksum_value;
+
+  Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const uint64_t blob_file_number = writer_->get_log_number();
+
+  if (blob_callback_) {
+    s = blob_callback_->OnBlobFileCompleted(
+        blob_file_paths_->back(), column_family_name_, job_id_,
+        blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+        blob_count_, blob_bytes_);
+  }
+
+  assert(blob_file_additions_);
+  blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+                                     std::move(checksum_method),
+                                     std::move(checksum_value));
+
+  assert(immutable_options_);
+  ROCKS_LOG_INFO(immutable_options_->logger,
+                 "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+                 " total blobs, %" PRIu64 " total bytes",
+                 column_family_name_.c_str(), job_id_, blob_file_number,
+                 blob_count_, blob_bytes_);
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+
+  return s;
+}
+
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+  assert(IsBlobFileOpen());
+
+  const WritableFileWriter* const file_writer = writer_->file();
+  assert(file_writer);
+
+  if (file_writer->GetFileSize() < blob_file_size_) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+void BlobFileBuilder::Abandon(const Status& s) {
+  if (!IsBlobFileOpen()) {
+    return;
+  }
+  if (blob_callback_) {
+    // BlobFileBuilder::Abandon() is called because of error while writing to
+    // Blob files. So we can ignore the below error.
+    blob_callback_
+        ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+                              job_id_, writer_->get_log_number(),
+                              creation_reason_, s, "", "", blob_count_,
+                              blob_bytes_)
+        .PermitUncheckedError();
+  }
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+}
+
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+                                                 uint64_t blob_file_number,
+                                                 uint64_t blob_offset) const {
+  Status s = Status::OK();
+
+  auto blob_cache = immutable_options_->blob_cache;
+  auto statistics = immutable_options_->statistics.get();
+  bool warm_cache =
+      prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+      creation_reason_ == BlobFileCreationReason::kFlush;
+
+  if (blob_cache && warm_cache) {
+    const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+                                            blob_file_number);
+    const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+    const Slice key = cache_key.AsSlice();
+
+    const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+    // Objects to be put into the cache have to be heap-allocated and
+    // self-contained, i.e. own their contents. The Cache has to be able to
+    // take unique ownership of them.
+    CacheAllocationPtr allocation =
+        AllocateBlock(blob.size(), blob_cache->memory_allocator());
+    memcpy(allocation.get(), blob.data(), blob.size());
+    std::unique_ptr<BlobContents> buf =
+        BlobContents::Create(std::move(allocation), blob.size());
+
+    Cache::CacheItemHelper* const cache_item_helper =
+        BlobContents::GetCacheItemHelper();
+    assert(cache_item_helper);
+
+    if (immutable_options_->lowest_used_cache_tier ==
+        CacheTier::kNonVolatileBlockTier) {
+      s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+                             buf->ApproximateMemoryUsage(),
+                             nullptr /* cache_handle */, priority);
+    } else {
+      s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+                             cache_item_helper->del_cb,
+                             nullptr /* cache_handle */, priority);
+    }
+
+    if (s.ok()) {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD);
+      RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+      buf.release();
+    } else {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+    }
+  }
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+class BlobFileBuilder {
+ public:
+  BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+                  const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+                  FileSystem* fs, const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  BlobFileBuilder(const BlobFileBuilder&) = delete;
+  BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+  ~BlobFileBuilder();
+
+  Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+  Status Finish();
+  void Abandon(const Status& s);
+
+ private:
+  bool IsBlobFileOpen() const;
+  Status OpenBlobFileIfNeeded();
+  Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+  Status WriteBlobToFile(const Slice& key, const Slice& blob,
+                         uint64_t* blob_file_number, uint64_t* blob_offset);
+  Status CloseBlobFile();
+  Status CloseBlobFileIfNeeded();
+
+  Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+                                  uint64_t blob_offset) const;
+
+  std::function<uint64_t()> file_number_generator_;
+  FileSystem* fs_;
+  const ImmutableOptions* immutable_options_;
+  uint64_t min_blob_size_;
+  uint64_t blob_file_size_;
+  CompressionType blob_compression_type_;
+  PrepopulateBlobCache prepopulate_blob_cache_;
+  const FileOptions* file_options_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  int job_id_;
+  uint32_t column_family_id_;
+  std::string column_family_name_;
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  BlobFileCompletionCallback* blob_callback_;
+  BlobFileCreationReason creation_reason_;
+  std::vector<std::string>* blob_file_paths_;
+  std::vector<BlobFileAddition>* blob_file_additions_;
+  std::unique_ptr<BlobLogWriter> writer_;
+  uint64_t blob_count_;
+  uint64_t blob_bytes_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFileNumberGenerator {
+ public:
+  uint64_t operator()() { return ++next_file_number_; }
+
+ private:
+  uint64_t next_file_number_ = 1;
+};
+
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+  BlobFileBuilderTest() {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+    clock_ = mock_env_->GetSystemClock().get();
+  }
+
+  void VerifyBlobFile(uint64_t blob_file_number,
+                      const std::string& blob_file_path,
+                      uint32_t column_family_id,
+                      CompressionType blob_compression_type,
+                      const std::vector<std::pair<std::string, std::string>>&
+                          expected_key_value_pairs,
+                      const std::vector<std::string>& blob_indexes) {
+    assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+    std::unique_ptr<FSRandomAccessFile> file;
+    constexpr IODebugContext* dbg = nullptr;
+    ASSERT_OK(
+        fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+    constexpr Statistics* statistics = nullptr;
+    BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+                                            statistics);
+
+    BlobLogHeader header;
+    ASSERT_OK(blob_log_reader.ReadHeader(&header));
+    ASSERT_EQ(header.version, kVersion1);
+    ASSERT_EQ(header.column_family_id, column_family_id);
+    ASSERT_EQ(header.compression, blob_compression_type);
+    ASSERT_FALSE(header.has_ttl);
+    ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+    for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+      BlobLogRecord record;
+      uint64_t blob_offset = 0;
+
+      ASSERT_OK(blob_log_reader.ReadRecord(
+          &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+      // Check the contents of the blob file
+      const auto& expected_key_value = expected_key_value_pairs[i];
+      const auto& key = expected_key_value.first;
+      const auto& value = expected_key_value.second;
+
+      ASSERT_EQ(record.key_size, key.size());
+      ASSERT_EQ(record.value_size, value.size());
+      ASSERT_EQ(record.expiration, 0);
+      ASSERT_EQ(record.key, key);
+      ASSERT_EQ(record.value, value);
+
+      // Make sure the blob reference returned by the builder points to the
+      // right place
+      BlobIndex blob_index;
+      ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+      ASSERT_FALSE(blob_index.IsInlined());
+      ASSERT_FALSE(blob_index.HasTTL());
+      ASSERT_EQ(blob_index.file_number(), blob_file_number);
+      ASSERT_EQ(blob_index.offset(), blob_offset);
+      ASSERT_EQ(blob_index.size(), value.size());
+    }
+
+    BlobLogFooter footer;
+    ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+    ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+    ASSERT_EQ(footer.expiration_range, ExpirationRange());
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;
+  SystemClock* clock_;
+  FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+  // Build a single blob file
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 4;
+  constexpr size_t value_offset = 1234;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckOneFile"),
+      0);
+  options.enable_blob_files = true;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+  ASSERT_EQ(
+      blob_file_addition.GetTotalBlobBytes(),
+      number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+  // Verify the contents of the new blob file as well as the blob references
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+  // Build multiple blob files: file size limit is set to the size of a single
+  // value, so each blob ends up in a file of its own
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+  ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const uint64_t blob_file_number = i + 2;
+
+    ASSERT_EQ(blob_file_paths[i],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+
+    const auto& blob_file_addition = blob_file_additions[i];
+
+    ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+              BlobLogRecord::kHeaderSize + key_size + value_size);
+  }
+
+  // Verify the contents of the new blob files as well as the blob references
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+        expected_key_value_pairs[i]};
+    std::vector<std::string> blob_index{blob_indexes[i]};
+
+    VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+                   expected_key_value_pair, blob_index);
+  }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+  // All values are below the min_blob_size threshold; no blob files get written
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_InlinedValues"),
+      0);
+  options.enable_blob_files = true;
+  options.min_blob_size = 1024;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options,
+      &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const std::string key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    const std::string value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    std::string blob_index;
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_TRUE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_TRUE(blob_file_paths.empty());
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+  // Build a blob file with a compressed blob
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 100;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string uncompressed_value(value_size, 'x');
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+  CompressionOptions opts;
+  CompressionContext context(kSnappyCompression);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       kSnappyCompression, sample_for_compression);
+
+  std::string compressed_value;
+  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+                              uncompressed_value.size(), &compressed_value));
+
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, compressed_value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kSnappyCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, CompressionError) {
+  // Simulate an error during compression
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_CompressionError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
+                                        [](void* arg) {
+                                          bool* ret = static_cast<bool*>(arg);
+                                          *ret = false;
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr char key[] = "1";
+  constexpr char value[] = "deadbeef";
+
+  std::string blob_index;
+
+  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+  ASSERT_EQ(
+      blob_file_paths[0],
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Checksum) {
+  // Build a blob file with checksum
+
+  class DummyFileChecksumGenerator : public FileChecksumGenerator {
+   public:
+    void Update(const char* /* data */, size_t /* n */) override {}
+
+    void Finalize() override {}
+
+    std::string GetChecksum() const override { return std::string("dummy"); }
+
+    const char* Name() const override { return "DummyFileChecksum"; }
+  };
+
+  class DummyFileChecksumGenFactory : public FileChecksumGenFactory {
+   public:
+    std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+        const FileChecksumGenContext& /* context */) override {
+      return std::unique_ptr<FileChecksumGenerator>(
+          new DummyFileChecksumGenerator);
+    }
+
+    const char* Name() const override { return "DummyFileChecksumGenFactory"; }
+  };
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"),
+      0);
+  options.enable_blob_files = true;
+  options.file_checksum_gen_factory =
+      std::make_shared<DummyFileChecksumGenFactory>();
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options,
+      &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string value("deadbeef");
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key.size() + value.size());
+  ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum");
+  ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy");
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+class BlobFileBuilderIOErrorTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::string> {
+ protected:
+  BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;
+  FileOptions file_options_;
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    BlobFileBuilderTest, BlobFileBuilderIOErrorTest,
+    ::testing::ValuesIn(std::vector<std::string>{
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile",
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader",
+        "BlobFileBuilder::WriteBlobToFile:AddRecord",
+        "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(BlobFileBuilderIOErrorTest, IOError) {
+  // Simulate an I/O error during the specified step of Add()
+  // Note: blob_file_size will be set to value_size in order for the first blob
+  // to trigger close
+  constexpr size_t value_size = 8;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderIOErrorTest_IOError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush,
&blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") { + ASSERT_TRUE(blob_file_paths.empty()); + } else { + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ(blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + } + + ASSERT_TRUE(blob_file_additions.empty()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc new file mode 100644 index 000000000..1a6cdf688 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_cache.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, 
NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  {
+    constexpr size_t charge = 1;
+
+    const Status s = cache_->Insert(key, reader.get(), charge,
+                                    &DeleteCacheEntry<BlobFileReader>, &handle);
+    if (!s.ok()) {
+      RecordTick(statistics, NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  reader.release();
+
+  *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h
new file mode 100644
index 000000000..8eec05f18
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+class Status;
+class BlobFileReader;
+class Slice;
+class IOTracer;
+
+class BlobFileCache {
+ public:
+  BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options,
+                const FileOptions* file_options, uint32_t column_family_id,
+                HistogramImpl* blob_file_read_hist,
+                const std::shared_ptr<IOTracer>& io_tracer);
+
+  BlobFileCache(const BlobFileCache&) = delete;
+  BlobFileCache& operator=(const BlobFileCache&) = delete;
+
+  Status GetBlobFileReader(uint64_t blob_file_number,
+                           CacheHandleGuard<BlobFileReader>* blob_file_reader);
+
+ private:
+  Cache* cache_;
+  // Note: mutex_ below is used to guard against multiple threads racing to open
+  // the same file.
+  Striped<port::Mutex, Slice> mutex_;
+  const ImmutableOptions* immutable_options_;
+  const FileOptions* file_options_;
+  uint32_t column_family_id_;
+  HistogramImpl* blob_file_read_hist_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  static constexpr size_t kNumberOfMutexStripes = 1 << 7;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc
new file mode 100644
index 000000000..d3a61b3c5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache_test.cc
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with a single blob in it.
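+// The helper writes through BlobLogWriter, so the resulting file follows the
+// blob log format; the rough layout (record sizes as defined in
+// blob_log_format.h) is:
+//
+//   [BlobLogHeader][record header][key][blob][BlobLogFooter]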
+void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + 
"BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard first; + CacheHandleGuard second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr backing_cache = + NewLRUCache(capacity, num_shard_bits, 
strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsMemoryLimit()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h new file mode 100644 index 000000000..ffe65a0ff --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_completion_callback.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: + BlobFileCompletionCallback( + SstFileManager* sst_file_manager, InstrumentedMutex* mutex, + ErrorHandler* error_handler, EventLogger* event_logger, + const std::vector>& listeners, + const std::string& dbname) + : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { +#ifndef ROCKSDB_LITE + sst_file_manager_ = sst_file_manager; + mutex_ = mutex; + error_handler_ = error_handler; +#else + (void)sst_file_manager; + (void)mutex; + (void)error_handler; +#endif // ROCKSDB_LITE + } + + void OnBlobFileCreationStarted(const std::string& file_name, + const std::string& column_family_name, + int job_id, + BlobFileCreationReason creation_reason) { +#ifndef ROCKSDB_LITE + // Notify the listeners. 
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, + column_family_name, file_name, + job_id, creation_reason); +#else + (void)file_name; + (void)column_family_name; + (void)job_id; + (void)creation_reason; +#endif + } + + Status OnBlobFileCompleted(const std::string& file_name, + const std::string& column_family_name, int job_id, + uint64_t file_number, + BlobFileCreationReason creation_reason, + const Status& report_status, + const std::string& checksum_value, + const std::string& checksum_method, + uint64_t blob_count, uint64_t blob_bytes) { + Status s; + +#ifndef ROCKSDB_LITE + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } +#endif // !ROCKSDB_LITE + + // Notify the listeners. + EventHelpers::LogAndNotifyBlobFileCreationFinished( + event_logger_, listeners_, dbname_, column_family_name, file_name, + job_id, file_number, creation_reason, + (!report_status.ok() ? report_status : s), + (checksum_value.empty() ? kUnknownFileChecksum : checksum_value), + (checksum_method.empty() ? kUnknownFileChecksumFuncName + : checksum_method), + blob_count, blob_bytes); + return s; + } + + private: +#ifndef ROCKSDB_LITE + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE + EventLogger* event_logger_; + std::vector> listeners_; + std::string dbname_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc new file mode 100644 index 000000000..52c336f49 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_garbage.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. +enum BlobFileGarbage::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileGarbage::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, garbage_blob_count_); + PutVarint64(output, garbage_blob_bytes_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. 
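+  //
+  // A complete encoding thus looks like:
+  //
+  //   varint64: blob file number
+  //   varint64: garbage blob count
+  //   varint64: garbage blob bytes
+  //   { varint32: custom field tag; length-prefixed slice: field value }...
+  //   varint32: kEndMarker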
+ + TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileGarbage::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileGarbage"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &garbage_blob_count_)) { + return Status::Corruption(class_name, "Error decoding garbage blob count"); + } + + if (!GetVarint64(input, &garbage_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding garbage blob bytes"); + } + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileGarbage::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileGarbage::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() && + lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes(); +} + +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage) { + os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber() + << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount() + << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes(); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage) { + jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber() + << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount() + << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes(); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h new file mode 100644 index 000000000..6dc14ddca --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_garbage.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
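+
+// BlobFileGarbage tracks the amount of garbage (i.e. blobs whose keys have
+// since been overwritten or deleted) in a given blob file. These records are
+// persisted in the MANIFEST as part of a VersionEdit; a typical round trip,
+// mirroring the unit tests below, is:
+//
+//   BlobFileGarbage garbage(/* blob_file_number */ 123,
+//                           /* garbage_blob_count */ 1,
+//                           /* garbage_blob_bytes */ 9876);
+//   std::string encoded;
+//   garbage.EncodeTo(&encoded);
+//
+//   BlobFileGarbage decoded;
+//   Slice input(encoded);
+//   const Status s = decoded.DecodeFrom(&input);
+//   // => s.ok() && decoded == garbage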
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileGarbage {
+ public:
+  BlobFileGarbage() = default;
+
+  BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count,
+                  uint64_t garbage_blob_bytes)
+      : blob_file_number_(blob_file_number),
+        garbage_blob_count_(garbage_blob_count),
+        garbage_blob_bytes_(garbage_blob_bytes) {}
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+  uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+  void EncodeTo(std::string* output) const;
+  Status DecodeFrom(Slice* input);
+
+  std::string DebugString() const;
+  std::string DebugJSON() const;
+
+ private:
+  enum CustomFieldTags : uint32_t;
+
+  uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+  uint64_t garbage_blob_count_ = 0;
+  uint64_t garbage_blob_bytes_ = 0;
+};
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileGarbage& blob_file_garbage);
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileGarbage& blob_file_garbage);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc
new file mode 100644
index 000000000..292a8b38a
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
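+
+// The tests below cover the encode/decode round trip, the decode error
+// paths, and the custom field mechanism. Since BlobFileGarbage does not
+// define any custom fields of its own yet, the tests inject them via the
+// "BlobFileGarbage::EncodeTo::CustomFields" sync point.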
+ +#include "db/blob/blob_file_garbage.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileGarbageTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) { + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_garbage, decoded); + } +}; + +TEST_F(BlobFileGarbageTest, Empty) { + BlobFileGarbage blob_file_garbage; + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t garbage_blob_count = 1; + constexpr uint64_t garbage_blob_bytes = 9876; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileGarbage blob_file_garbage; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count")); + } + + constexpr uint64_t garbage_blob_count = 4567; + PutVarint64(&str, garbage_blob_count); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes")); + } + + constexpr uint64_t garbage_blob_bytes = 12345678; + PutVarint64(&str, garbage_blob_bytes); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t garbage_blob_count = 9999; + constexpr uint64_t garbage_blob_bytes = 100000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); 
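+
+  // Note: decoding succeeded above even though tag 2 is unknown to
+  // DecodeFrom: tags below kForwardIncompatibleMask (1 << 6) are skipped by
+  // consuming their length-prefixed values, and parsing continues until
+  // kEndMarker is reached.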
+} + +TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t garbage_blob_count = 100; + constexpr uint64_t garbage_blob_bytes = 2000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc new file mode 100644 index 000000000..4913137e5 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_meta.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_meta.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t SharedBlobFileMetaData::GetBlobFileSize() const { + return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize; +} + +std::string SharedBlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta) { + os << "blob_file_number: " << shared_meta.GetBlobFileNumber() + << " total_blob_count: " << shared_meta.GetTotalBlobCount() + << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes() + << " checksum_method: " << shared_meta.GetChecksumMethod() + << " checksum_value: " + << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +std::string BlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) { + const auto& shared_meta = meta.GetSharedMeta(); + assert(shared_meta); + os << (*shared_meta); + + os << " linked_ssts: {"; + for (uint64_t file_number : meta.GetLinkedSsts()) { + os << ' ' << file_number; + } + os << " }"; + + os << " garbage_blob_count: " << meta.GetGarbageBlobCount() + << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes(); + + return os; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h new file mode 100644 index 000000000..d7c8a1243 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_meta.h @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// SharedBlobFileMetaData represents the immutable part of blob files' metadata, +// like the blob file number, total number and size of blobs, or checksum +// method and value. There is supposed to be one object of this class per blob +// file (shared across all versions that include the blob file in question); +// hence, the type is neither copyable nor movable. A blob file can be marked +// obsolete when the corresponding SharedBlobFileMetaData object is destroyed. + +class SharedBlobFileMetaData { + public: + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + return std::shared_ptr(new SharedBlobFileMetaData( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value))); + } + + template + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, Deleter deleter) { + return std::shared_ptr( + new SharedBlobFileMetaData(blob_file_number, total_blob_count, + total_blob_bytes, std::move(checksum_method), + std::move(checksum_value)), + deleter); + } + + SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete; + SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete; + + SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete; + SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete; + + uint64_t GetBlobFileSize() const; + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + std::string DebugString() const; + + private: + SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t blob_file_number_; + uint64_t total_blob_count_; + uint64_t total_blob_bytes_; + std::string checksum_method_; + std::string checksum_value_; +}; + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta); + +// BlobFileMetaData contains the part of the metadata for blob files that can +// vary across versions, like the amount of garbage in the blob file. In +// addition, BlobFileMetaData objects point to and share the ownership of the +// SharedBlobFileMetaData object for the corresponding blob file. Similarly to +// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They +// are meant to be jointly owned by the versions in which the blob file has the +// same (immutable *and* mutable) state. 
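+//
+// The intended ownership structure is roughly:
+//
+//   Version 1 --> BlobFileMetaData A --+
+//   Version 2 --> BlobFileMetaData B --+--> SharedBlobFileMetaData --> file
+//
+// where A and B describe the same blob file but may carry different garbage
+// statistics, and the blob file itself can be marked obsolete once the last
+// reference to its SharedBlobFileMetaData object is released.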
+ +class BlobFileMetaData { + public: + using LinkedSsts = std::unordered_set; + + static std::shared_ptr Create( + std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + return std::shared_ptr( + new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes)); + } + + BlobFileMetaData(const BlobFileMetaData&) = delete; + BlobFileMetaData& operator=(const BlobFileMetaData&) = delete; + + BlobFileMetaData(BlobFileMetaData&&) = delete; + BlobFileMetaData& operator=(BlobFileMetaData&&) = delete; + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileSize() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileSize(); + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + uint64_t GetTotalBlobCount() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobCount(); + } + uint64_t GetTotalBlobBytes() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobBytes(); + } + const std::string& GetChecksumMethod() const { + assert(shared_meta_); + return shared_meta_->GetChecksumMethod(); + } + const std::string& GetChecksumValue() const { + assert(shared_meta_); + return shared_meta_->GetChecksumValue(); + } + + const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + std::string DebugString() const; + + private: + BlobFileMetaData(std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : shared_meta_(std::move(shared_meta)), + linked_ssts_(std::move(linked_ssts)), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) { + assert(shared_meta_); + assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount()); + assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes()); + } + + std::shared_ptr shared_meta_; + LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_; + uint64_t garbage_blob_bytes_; +}; + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc new file mode 100644 index 000000000..a4eabb605 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_reader.cc @@ -0,0 +1,610 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_log_format.h" +#include "file/file_prefetch_buffer.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/multiget_context.h" +#include "test_util/sync_point.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFileReader::Create( + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr& io_tracer, + std::unique_ptr* blob_file_reader) { + assert(blob_file_reader); + assert(!*blob_file_reader); + + uint64_t file_size = 0; + std::unique_ptr file_reader; + + { + const Status s = + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, &file_size, &file_reader); + if (!s.ok()) { + return s; + } + } + + assert(file_reader); + + Statistics* const statistics = immutable_options.stats; + + CompressionType compression_type = kNoCompression; + + { + const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, + &compression_type); + if (!s.ok()) { + return s; + } + } + + { + const Status s = ReadFooter(file_reader.get(), file_size, statistics); + if (!s.ok()) { + return s; + } + } + + blob_file_reader->reset( + new BlobFileReader(std::move(file_reader), file_size, compression_type, + immutable_options.clock, statistics)); + + return Status::OK(); +} + +Status BlobFileReader::OpenFile( + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, + std::unique_ptr* file_reader) { + assert(file_size); + assert(file_reader); + + const auto& cf_paths = immutable_options.cf_paths; + assert(!cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(cf_paths.front().path, blob_file_number); + + FileSystem* const fs = immutable_options.fs.get(); + assert(fs); + + constexpr IODebugContext* dbg = nullptr; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize"); + + const Status s = + fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg); + if (!s.ok()) { + return s; + } + } + + if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return Status::Corruption("Malformed blob file"); + } + + std::unique_ptr file; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile"); + + const Status s = + fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg); + if (!s.ok()) { + return s; + } + } + + assert(file); + + if (immutable_options.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + + file_reader->reset(new RandomAccessFileReader( + std::move(file), blob_file_path, immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); + + return Status::OK(); +} + +Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, + Statistics* statistics, + CompressionType* compression_type) { + assert(file_reader); + assert(compression_type); + + Slice header_slice; + Buffer buf; + AlignedBuf 
aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile"); + + constexpr uint64_t read_offset = 0; + constexpr size_t read_size = BlobLogHeader::kSize; + + // TODO: rate limit reading headers from blob files. + const Status s = ReadFromFile(file_reader, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult", + &header_slice); + } + + BlobLogHeader header; + + { + const Status s = header.DecodeFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (header.has_ttl || header.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + if (header.column_family_id != column_family_id) { + return Status::Corruption("Column family ID mismatch"); + } + + *compression_type = header.compression; + + return Status::OK(); +} + +Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics) { + assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); + assert(file_reader); + + Slice footer_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile"); + + const uint64_t read_offset = file_size - BlobLogFooter::kSize; + constexpr size_t read_size = BlobLogFooter::kSize; + + // TODO: rate limit reading footers from blob files. + const Status s = ReadFromFile(file_reader, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult", + &footer_slice); + } + + BlobLogFooter footer; + + { + const Status s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (footer.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + return Status::OK(); +} + +Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, + Buffer* buf, AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority) { + assert(slice); + assert(buf); + assert(aligned_buf); + + assert(file_reader); + + RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size); + + Status s; + + if (file_reader->use_direct_io()) { + constexpr char* scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + aligned_buf, rate_limiter_priority); + } else { + buf->reset(new char[read_size]); + constexpr AlignedBuf* aligned_scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, + buf->get(), aligned_scratch, rate_limiter_priority); + } + + if (!s.ok()) { + return s; + } + + if (slice->size() != read_size) { + return Status::Corruption("Failed to read data from blob file"); + } + + return Status::OK(); +} + +BlobFileReader::BlobFileReader( + std::unique_ptr&& file_reader, uint64_t file_size, + CompressionType compression_type, SystemClock* clock, + Statistics* statistics) + : file_reader_(std::move(file_reader)), + file_size_(file_size), + compression_type_(compression_type), + clock_(clock), + statistics_(statistics) { + assert(file_reader_); +} + +BlobFileReader::~BlobFileReader() 
= default; + +Status BlobFileReader::GetBlob( + const ReadOptions& read_options, const Slice& user_key, uint64_t offset, + uint64_t value_size, CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator, + std::unique_ptr* result, uint64_t* bytes_read) const { + assert(result); + + const uint64_t key_size = user_key.size(); + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + return Status::Corruption("Invalid blob offset"); + } + + if (compression_type != compression_type_) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + // Note: if verify_checksum is set, we read the entire blob record to be able + // to perform the verification; otherwise, we just read the blob itself. Since + // the offset in BlobIndex actually points to the blob value, we need to make + // an adjustment in the former case. + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offset >= adjustment); + + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + + Slice record_slice; + Buffer buf; + AlignedBuf aligned_buf; + + bool prefetched = false; + + if (prefetch_buffer) { + Status s; + constexpr bool for_compaction = true; + + prefetched = prefetch_buffer->TryReadFromCache( + IOOptions(), file_reader_.get(), record_offset, + static_cast(record_size), &record_slice, &s, + read_options.rate_limiter_priority, for_compaction); + if (!s.ok()) { + return s; + } + } + + if (!prefetched) { + TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, 1); + PERF_COUNTER_ADD(blob_read_byte, record_size); + PERF_TIMER_GUARD(blob_read_time); + const Status s = ReadFromFile(file_reader_.get(), record_offset, + static_cast(record_size), statistics_, + &record_slice, &buf, &aligned_buf, + read_options.rate_limiter_priority); + if (!s.ok()) { + return s; + } + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult", + &record_slice); + + if (read_options.verify_checksums) { + const Status s = VerifyBlob(record_slice, user_key, value_size); + if (!s.ok()) { + return s; + } + } + + const Slice value_slice(record_slice.data() + adjustment, value_size); + + { + const Status s = UncompressBlobIfNeeded( + value_slice, compression_type, allocator, clock_, statistics_, result); + if (!s.ok()) { + return s; + } + } + + if (bytes_read) { + *bytes_read = record_size; + } + + return Status::OK(); +} + +void BlobFileReader::MultiGetBlob( + const ReadOptions& read_options, MemoryAllocator* allocator, + autovector>>& + blob_reqs, + uint64_t* bytes_read) const { + const size_t num_blobs = blob_reqs.size(); + assert(num_blobs > 0); + assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE); + +#ifndef NDEBUG + for (size_t i = 0; i < num_blobs - 1; ++i) { + assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset); + } +#endif // !NDEBUG + + std::vector read_reqs; + autovector adjustments; + uint64_t total_len = 0; + read_reqs.reserve(num_blobs); + for (size_t i = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + const size_t key_size = req->user_key->size(); + const uint64_t offset = req->offset; + const uint64_t value_size = req->len; + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + *req->status = Status::Corruption("Invalid blob 
offset"); + continue; + } + if (req->compression != compression_type_) { + *req->status = + Status::Corruption("Compression type mismatch when reading a blob"); + continue; + } + + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(req->offset >= adjustment); + adjustments.push_back(adjustment); + + FSReadRequest read_req = {}; + read_req.offset = req->offset - adjustment; + read_req.len = req->len + adjustment; + read_reqs.emplace_back(read_req); + total_len += read_req.len; + } + + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); + + Buffer buf; + AlignedBuf aligned_buf; + + Status s; + bool direct_io = file_reader_->use_direct_io(); + if (direct_io) { + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = nullptr; + } + } else { + buf.reset(new char[total_len]); + std::ptrdiff_t pos = 0; + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = buf.get() + pos; + pos += read_reqs[i].len; + } + } + TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, num_blobs); + PERF_COUNTER_ADD(blob_read_byte, total_len); + s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), + direct_io ? &aligned_buf : nullptr, + read_options.rate_limiter_priority); + if (!s.ok()) { + for (auto& req : read_reqs) { + req.status.PermitUncheckedError(); + } + for (auto& blob_req : blob_reqs) { + BlobReadRequest* const req = blob_req.first; + assert(req); + assert(req->status); + + if (!req->status->IsCorruption()) { + // Avoid overwriting corruption status. + *req->status = s; + } + } + return; + } + + assert(s.ok()); + + uint64_t total_bytes = 0; + for (size_t i = 0, j = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + if (!req->status->ok()) { + continue; + } + + assert(j < read_reqs.size()); + auto& read_req = read_reqs[j++]; + const auto& record_slice = read_req.result; + if (read_req.status.ok() && record_slice.size() != read_req.len) { + read_req.status = + IOStatus::Corruption("Failed to read data from blob file"); + } + + *req->status = read_req.status; + if (!req->status->ok()) { + continue; + } + + // Verify checksums if enabled + if (read_options.verify_checksums) { + *req->status = VerifyBlob(record_slice, *req->user_key, req->len); + if (!req->status->ok()) { + continue; + } + } + + // Uncompress blob if needed + Slice value_slice(record_slice.data() + adjustments[i], req->len); + *req->status = + UncompressBlobIfNeeded(value_slice, compression_type_, allocator, + clock_, statistics_, &blob_reqs[i].second); + if (req->status->ok()) { + total_bytes += record_slice.size(); + } + } + + if (bytes_read) { + *bytes_read = total_bytes; + } +} + +Status BlobFileReader::VerifyBlob(const Slice& record_slice, + const Slice& user_key, uint64_t value_size) { + PERF_TIMER_GUARD(blob_checksum_time); + + BlobLogRecord record; + + const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize); + + { + const Status s = record.DecodeHeaderFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + if (record.key_size != user_key.size()) { + return Status::Corruption("Key size mismatch when reading blob"); + } + + if (record.value_size != value_size) { + return Status::Corruption("Value size mismatch when reading blob"); + } + + record.key = + Slice(record_slice.data() + BlobLogRecord::kHeaderSize, 
record.key_size); + if (record.key != user_key) { + return Status::Corruption("Key mismatch when reading blob"); + } + + record.value = Slice(record.key.data() + record.key_size, value_size); + + { + TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC", + &record); + + const Status s = record.CheckBlobCRC(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status BlobFileReader::UncompressBlobIfNeeded( + const Slice& value_slice, CompressionType compression_type, + MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics, + std::unique_ptr* result) { + assert(result); + + if (compression_type == kNoCompression) { + CacheAllocationPtr allocation = + AllocateBlock(value_slice.size(), allocator); + memcpy(allocation.get(), value_slice.data(), value_slice.size()); + + *result = BlobContents::Create(std::move(allocation), value_slice.size()); + + return Status::OK(); + } + + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + + size_t uncompressed_size = 0; + constexpr uint32_t compression_format_version = 2; + + CacheAllocationPtr output; + + { + PERF_TIMER_GUARD(blob_decompress_time); + StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); + output = UncompressData(info, value_slice.data(), value_slice.size(), + &uncompressed_size, compression_format_version, + allocator); + } + + TEST_SYNC_POINT_CALLBACK( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + + if (!output) { + return Status::Corruption("Unable to uncompress blob"); + } + + *result = BlobContents::Create(std::move(output), uncompressed_size); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h new file mode 100644 index 000000000..75b756da1 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_reader.h @@ -0,0 +1,108 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "db/blob/blob_read_request.h" +#include "file/random_access_file_reader.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Status; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +struct ReadOptions; +class Slice; +class FilePrefetchBuffer; +class BlobContents; +class Statistics; + +class BlobFileReader { + public: + static Status Create(const ImmutableOptions& immutable_options, + const FileOptions& file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + std::unique_ptr* reader); + + BlobFileReader(const BlobFileReader&) = delete; + BlobFileReader& operator=(const BlobFileReader&) = delete; + + ~BlobFileReader(); + + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t offset, uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + MemoryAllocator* allocator, + std::unique_ptr* result, + uint64_t* bytes_read) const; + + // offsets must be sorted in ascending order by caller. 
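+  // On return, each request's individual status reflects the outcome for
+  // that blob; one blob failing validation does not prevent the others from
+  // being served. A minimal call sequence, mirroring the unit tests
+  // (kNumBlobs, keys, offsets and sizes are placeholders), is:
+  //
+  //   std::array<Status, kNumBlobs> statuses;
+  //   std::array<BlobReadRequest, kNumBlobs> requests;
+  //   autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+  //       blob_reqs;
+  //   for (size_t i = 0; i < kNumBlobs; ++i) {  // offsets in ascending order
+  //     requests[i] = BlobReadRequest(keys[i], offsets[i], sizes[i],
+  //                                   compression, nullptr, &statuses[i]);
+  //     blob_reqs.emplace_back(&requests[i], std::unique_ptr<BlobContents>());
+  //   }
+  //   uint64_t bytes_read = 0;
+  //   reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);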
+  void MultiGetBlob(
+      const ReadOptions& read_options, MemoryAllocator* allocator,
+      autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+          blob_reqs,
+      uint64_t* bytes_read) const;
+
+  CompressionType GetCompressionType() const { return compression_type_; }
+
+  uint64_t GetFileSize() const { return file_size_; }
+
+ private:
+  BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+                 uint64_t file_size, CompressionType compression_type,
+                 SystemClock* clock, Statistics* statistics);
+
+  static Status OpenFile(const ImmutableOptions& immutable_options,
+                         const FileOptions& file_opts,
+                         HistogramImpl* blob_file_read_hist,
+                         uint64_t blob_file_number,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         uint64_t* file_size,
+                         std::unique_ptr<RandomAccessFileReader>* file_reader);
+
+  static Status ReadHeader(const RandomAccessFileReader* file_reader,
+                           uint32_t column_family_id, Statistics* statistics,
+                           CompressionType* compression_type);
+
+  static Status ReadFooter(const RandomAccessFileReader* file_reader,
+                           uint64_t file_size, Statistics* statistics);
+
+  using Buffer = std::unique_ptr<char[]>;
+
+  static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+                             uint64_t read_offset, size_t read_size,
+                             Statistics* statistics, Slice* slice, Buffer* buf,
+                             AlignedBuf* aligned_buf,
+                             Env::IOPriority rate_limiter_priority);
+
+  static Status VerifyBlob(const Slice& record_slice, const Slice& user_key,
+                           uint64_t value_size);
+
+  static Status UncompressBlobIfNeeded(const Slice& value_slice,
+                                       CompressionType compression_type,
+                                       MemoryAllocator* allocator,
+                                       SystemClock* clock,
+                                       Statistics* statistics,
+                                       std::unique_ptr<BlobContents>* result);
+
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  uint64_t file_size_;
+  CompressionType compression_type_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc b/src/rocksdb/db/blob/blob_file_reader_test.cc
new file mode 100644
index 000000000..03458e2b5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader_test.cc
@@ -0,0 +1,1024 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
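+// Keys and blobs are written in the order given; each blob's offset and its
+// on-disk (possibly compressed) size are returned through blob_offsets and
+// blob_sizes so that the blobs can be read back via BlobFileReader.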
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector& keys, + const std::vector& blobs, CompressionType compression, + std::vector& blob_offsets, + std::vector& blob_sizes) { + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + std::vector compressed_blobs(num); + std::vector blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { + ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] = compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +// Creates a test blob file with a single blob in it. Note: this method +// makes it possible to test various corner cases by allowing the caller +// to specify the contents of various blob file header/footer fields. 
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const Slice& key, + const Slice& blob, CompressionType compression, + uint64_t* blob_offset, uint64_t* blob_size) { + std::vector keys{key}; + std::vector blobs{blob}; + std::vector blob_offsets{0}; + std::vector blob_sizes{0}; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, keys, blobs, compression, blob_offsets, + blob_sizes); + if (blob_offset) { + *blob_offset = blob_offsets[0]; + } + if (blob_size) { + *blob_size = blob_sizes[0]; + } +} + +} // anonymous namespace + +class BlobFileReaderTest : public testing::Test { + protected: + BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_CreateReaderAndGetBlob"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 3; + const std::vector key_strs = {"key1", "key2", "key3"}; + const std::vector blob_strs = {"blob1", "blob2", "blob3"}; + + const std::vector keys = {key_strs[0], key_strs[1], key_strs[2]}; + const std::vector blobs = {blob_strs[0], blob_strs[1], blob_strs[2]}; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[0]); + ASSERT_EQ(bytes_read, blob_sizes[0]); + + // MultiGetBlob + bytes_read = 0; + size_t total_size = 0; + + std::array statuses_buf; + std::array requests_buf; + autovector>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + const auto& result = blob_reqs[i].second; + + ASSERT_OK(statuses_buf[i]); + ASSERT_NE(result, nullptr); + ASSERT_EQ(result->data(), blobs[i]); + total_size += blob_sizes[i]; + } + 
ASSERT_EQ(bytes_read, total_size); + } + + read_options.verify_checksums = true; + + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[1]); + + const uint64_t key_size = keys[1].size(); + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_sizes[1]); + } + + // Invalid offset (too close to start of file) + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0] - 1, + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Invalid offset (too close to end of file) + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[2], blob_offsets[2] + 1, + blob_sizes[2], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect compression type + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kZSTD, prefetch_buffer, allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect key size + { + constexpr char shorter_key[] = "k"; + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, shorter_key, + blob_offsets[0] - + (keys[0].size() - sizeof(shorter_key) + 1), + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1); + key_refs[1] = std::cref(shorter_key_slice); + + autovector offsets{ + blob_offsets[0], + blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()), + blob_offsets[2]}; + + std::array statuses_buf; + std::array requests_buf; + autovector>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + if (i == 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect key + { + constexpr char incorrect_key[] = "foo1"; + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, incorrect_key, blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1); + key_refs[2] = std::cref(wrong_key_slice); + + std::array statuses_buf; + std::array requests_buf; + autovector>> + 
blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + if (i == num_blobs - 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect value size + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1] + 1, kNoCompression, + prefetch_buffer, allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + + std::array statuses_buf; + std::array requests_buf; + + requests_buf[0] = + BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0], + kNoCompression, nullptr, &statuses_buf[0]); + requests_buf[1] = + BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1, + kNoCompression, nullptr, &statuses_buf[1]); + requests_buf[2] = + BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2], + kNoCompression, nullptr, &statuses_buf[2]); + + autovector>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + if (i != 1) { + ASSERT_OK(statuses_buf[i]); + } else { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } + } + } +} + +TEST_F(BlobFileReaderTest, Malformed) { + // Write a blob file consisting of nothing but a header, and make sure we + // detect the error when we open it for reading + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr uint64_t blob_file_number = 1; + + { + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), + immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + } + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, TTL) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( 
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = true; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInHeader"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + const ExpirationRange expiration_range_header( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr ExpirationRange expiration_range_footer; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInFooter"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range_header; + const ExpirationRange expiration_range_footer( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_IncorrectColumnFamily"), + 0); + options.enable_blob_files = 
true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + constexpr uint32_t incorrect_column_family_id = 2; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + incorrect_column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, BlobCRCError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { + BlobLogRecord* const record = static_cast(arg); + assert(record); + + record->blob_crc = 0xfaceb00c; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, allocator, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileReaderTest, Compression) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + 
ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blob); + ASSERT_EQ(bytes_read, blob_size); + } + + read_options.verify_checksums = true; + + { + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); + } +} + +TEST_F(BlobFileReaderTest, UncompressionError) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_UncompressionError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { + CacheAllocationPtr* const output = + static_cast(arg); + assert(output); + + output->reset(); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderIOErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get())); + } + + std::unique_ptr mock_env_; + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest, + 
::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:GetFileSize", + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::ReadHeader:ReadFromFile", + "BlobFileReader::ReadFooter:ReadFromFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(BlobFileReaderIOErrorTest, IOError) { + // Simulates an I/O error during the specified step + + Options options; + options.env = fault_injection_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(fault_injection_env_.get(), + "BlobFileReaderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); + + if (fail_during_create) { + ASSERT_TRUE(s.IsIOError()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .IsIOError()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderDecodingErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + } + + std::unique_ptr mock_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::ReadHeader:TamperWithResult", + "BlobFileReader::ReadFooter:TamperWithResult", + "BlobFileReader::GetBlob:TamperWithResult"})); + +TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderDecodingErrorTest_DecodingError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, 
blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { + Slice* const slice = static_cast(arg); + assert(slice); + assert(!slice->empty()); + + slice->remove_prefix(1); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; + + if (fail_during_create) { + ASSERT_TRUE(s.IsCorruption()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc new file mode 100644 index 000000000..d328d7ff4 --- /dev/null +++ b/src/rocksdb/db/blob/blob_garbage_meter.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + flows_[blob_file_number].AddInFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + // Note: in order to measure the amount of additional garbage, we only need to + // track the outflow for preexisting files, i.e. those that also had inflow. + // (Newly written files would only have outflow.) 
+  auto it = flows_.find(blob_file_number);
+  if (it == flows_.end()) {
+    return Status::OK();
+  }
+
+  it->second.AddOutFlow(bytes);
+
+  return Status::OK();
+}
+
+Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value,
+                               uint64_t* blob_file_number, uint64_t* bytes) {
+  assert(blob_file_number);
+  assert(*blob_file_number == kInvalidBlobFileNumber);
+  assert(bytes);
+  assert(*bytes == 0);
+
+  ParsedInternalKey ikey;
+
+  {
+    constexpr bool log_err_key = false;
+    const Status s = ParseInternalKey(key, &ikey, log_err_key);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (ikey.type != kTypeBlobIndex) {
+    return Status::OK();
+  }
+
+  BlobIndex blob_index;
+
+  {
+    const Status s = blob_index.DecodeFrom(value);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (blob_index.IsInlined() || blob_index.HasTTL()) {
+    return Status::Corruption("Unexpected TTL/inlined blob index");
+  }
+
+  *blob_file_number = blob_index.file_number();
+  *bytes =
+      blob_index.size() +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size());
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h
new file mode 100644
index 000000000..a6c04b0b2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <unordered_map>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A class that can be used to compute the amount of additional garbage
+// generated by a compaction. It parses the keys and blob references in the
+// input and output of a compaction, and aggregates the "inflow" and "outflow"
+// on a per-blob file basis. The amount of additional garbage for any given
+// blob file can then be computed by subtracting the outflow from the inflow.
+class BlobGarbageMeter {
+ public:
+  // A class to store the number and total size of blobs on a per-blob file
+  // basis.
+  class BlobStats {
+   public:
+    void Add(uint64_t bytes) {
+      ++count_;
+      bytes_ += bytes;
+    }
+    void Add(uint64_t count, uint64_t bytes) {
+      count_ += count;
+      bytes_ += bytes;
+    }
+
+    uint64_t GetCount() const { return count_; }
+    uint64_t GetBytes() const { return bytes_; }
+
+   private:
+    uint64_t count_ = 0;
+    uint64_t bytes_ = 0;
+  };
+
+  // A class to keep track of the "inflow" and the "outflow" and to compute the
+  // amount of additional garbage for a given blob file.
+  class BlobInOutFlow {
+   public:
+    void AddInFlow(uint64_t bytes) {
+      in_flow_.Add(bytes);
+      assert(IsValid());
+    }
+    void AddOutFlow(uint64_t bytes) {
+      out_flow_.Add(bytes);
+      assert(IsValid());
+    }
+
+    const BlobStats& GetInFlow() const { return in_flow_; }
+    const BlobStats& GetOutFlow() const { return out_flow_; }
+
+    bool IsValid() const {
+      return in_flow_.GetCount() >= out_flow_.GetCount() &&
+             in_flow_.GetBytes() >= out_flow_.GetBytes();
+    }
+    bool HasGarbage() const {
+      assert(IsValid());
+      return in_flow_.GetCount() > out_flow_.GetCount();
+    }
+    uint64_t GetGarbageCount() const {
+      assert(IsValid());
+      assert(HasGarbage());
+      return in_flow_.GetCount() - out_flow_.GetCount();
+    }
+    uint64_t GetGarbageBytes() const {
+      assert(IsValid());
+      assert(HasGarbage());
+      return in_flow_.GetBytes() - out_flow_.GetBytes();
+    }
+
+   private:
+    BlobStats in_flow_;
+    BlobStats out_flow_;
+  };
+
+  Status ProcessInFlow(const Slice& key, const Slice& value);
+  Status ProcessOutFlow(const Slice& key, const Slice& value);
+
+  const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const {
+    return flows_;
+  }
+
+ private:
+  static Status Parse(const Slice& key, const Slice& value,
+                      uint64_t* blob_file_number, uint64_t* bytes);
+
+  std::unordered_map<uint64_t, BlobInOutFlow> flows_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
new file mode 100644
index 000000000..ba53f06f1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(BlobGarbageMeterTest, MeasureGarbage) {
+  BlobGarbageMeter blob_garbage_meter;
+
+  struct BlobDescriptor {
+    std::string user_key;
+    uint64_t blob_file_number;
+    uint64_t offset;
+    uint64_t size;
+    CompressionType compression_type;
+    bool has_in_flow;
+    bool has_out_flow;
+
+    uint64_t GetExpectedBytes() const {
+      return size +
+             BlobLogRecord::CalculateAdjustmentForRecordHeader(
+                 user_key.size());
+    }
+  };
+
+  // Note: blob file 4 has the same inflow and outflow and hence no additional
+  // garbage. Blob file 5 has less outflow than inflow and thus it does have
+  // additional garbage. Blob file 6 is a newly written file (i.e. no inflow,
+  // only outflow) and is thus not tracked by the meter.
+ std::vector blobs{ + {"key", 4, 1234, 555, kLZ4Compression, true, true}, + {"other_key", 4, 6789, 101010, kLZ4Compression, true, true}, + {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true}, + {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true}, + {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false}, + {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false}, + {"new_key", 6, 7777, 9999, kNoCompression, false, true}}; + + for (const auto& blob : blobs) { + constexpr SequenceNumber seq = 123; + const InternalKey key(blob.user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + std::string value; + BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size, + blob.compression_type); + const Slice value_slice(value); + + if (blob.has_in_flow) { + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + } + if (blob.has_out_flow) { + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + } + } + + const auto& flows = blob_garbage_meter.flows(); + ASSERT_EQ(flows.size(), 2); + + { + const auto it = flows.find(4); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + constexpr uint64_t expected_count = 2; + const uint64_t expected_bytes = + blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes(); + + const auto& in = flow.GetInFlow(); + ASSERT_EQ(in.GetCount(), expected_count); + ASSERT_EQ(in.GetBytes(), expected_bytes); + + const auto& out = flow.GetOutFlow(); + ASSERT_EQ(out.GetCount(), expected_count); + ASSERT_EQ(out.GetBytes(), expected_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_FALSE(flow.HasGarbage()); + } + + { + const auto it = flows.find(5); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + const auto& in = flow.GetInFlow(); + + constexpr uint64_t expected_in_count = 4; + const uint64_t expected_in_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() + + blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes(); + + ASSERT_EQ(in.GetCount(), expected_in_count); + ASSERT_EQ(in.GetBytes(), expected_in_bytes); + + const auto& out = flow.GetOutFlow(); + + constexpr uint64_t expected_out_count = 2; + const uint64_t expected_out_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes(); + + ASSERT_EQ(out.GetCount(), expected_out_count); + ASSERT_EQ(out.GetBytes(), expected_out_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_TRUE(flow.HasGarbage()); + ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count); + ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes); + } +} + +TEST(BlobGarbageMeterTest, PlainValue) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeValue); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + ASSERT_TRUE(blob_garbage_meter.flows().empty()); +} + +TEST(BlobGarbageMeterTest, CorruptInternalKey) { + constexpr char corrupt_key[] = "i_am_corrupt"; + const Slice key_slice(corrupt_key); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + 
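+// Editorial sketch, not part of the upstream test suite: a minimal
+// illustration of the inflow-minus-outflow accounting exercised above. A blob
+// reference that flows in during compaction but never flows out (i.e. its key
+// is dropped) counts fully as additional garbage. The key, file number,
+// offset and size below are assumed, arbitrary values.
+TEST(BlobGarbageMeterTest, DroppedKeyBecomesGarbage) {
+  constexpr char user_key[] = "dropped_key";
+  constexpr SequenceNumber seq = 42;
+  constexpr uint64_t blob_file_number = 7;
+  constexpr uint64_t offset = 100;
+  constexpr uint64_t size = 500;
+
+  const InternalKey key(user_key, seq, kTypeBlobIndex);
+  const Slice key_slice = key.Encode();
+
+  std::string value;
+  BlobIndex::EncodeBlob(&value, blob_file_number, offset, size,
+                        kNoCompression);
+  const Slice value_slice(value);
+
+  BlobGarbageMeter blob_garbage_meter;
+
+  // Inflow only: the reference exists in the compaction input but is never
+  // written to the output, so the entire blob becomes garbage.
+  ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+
+  const auto it = blob_garbage_meter.flows().find(blob_file_number);
+  ASSERT_NE(it, blob_garbage_meter.flows().end());
+
+  const auto& flow = it->second;
+  ASSERT_TRUE(flow.IsValid());
+  ASSERT_TRUE(flow.HasGarbage());
+  ASSERT_EQ(flow.GetGarbageCount(), 1);
+  ASSERT_EQ(flow.GetGarbageBytes(),
+            size + BlobLogRecord::CalculateAdjustmentForRecordHeader(
+                       sizeof(user_key) - 1));
+}
+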
+TEST(BlobGarbageMeterTest, CorruptBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "i_am_not_a_blob_index"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr uint64_t expiration = 1234567890; + constexpr char inlined_value[] = "inlined"; + + std::string value; + BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value); + + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h new file mode 100644 index 000000000..e9944d784 --- /dev/null +++ b/src/rocksdb/db/blob/blob_index.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/compression_type.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. +// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. ValueType::kTypeValue). 
+class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + BlobIndex(const BlobIndex&) = default; + BlobIndex& operator=(const BlobIndex&) = default; + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + CompressionType compression() const { + assert(!IsInlined()); + return compression_; + } + + Status DecodeFrom(Slice slice) { + const char* kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption(kErrorMessage, + "Unknown blob index type: " + + std::to_string(static_cast(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + std::string DebugString(bool output_hex) const { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ + << " size:" << size_ + << " compression: " << CompressionTypeToString(compression_); + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, + uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType 
compression_ = kNoCompression; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc new file mode 100644 index 000000000..8e26281e3 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_format.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_format.h" + +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +void BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); +} + +Status BlobLogHeader::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); + } + return Status::OK(); +} + +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); +} + +Status BlobLogFooter::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t)); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number = 0; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); + } + return Status::OK(); +} + +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != 
nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); +} + +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob record"; + if (src.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); + } + return Status::OK(); +} + +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h new file mode 100644 index 000000000..607db2367 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_format.h @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Log format information shared by reader and writer. + +#pragma once + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; + +using ExpirationRange = std::pair; + +// clang-format off + +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. 
+
+// clang-format on
+
+struct BlobLogHeader {
+  static constexpr size_t kSize = 30;
+
+  BlobLogHeader() = default;
+  BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
+                bool _has_ttl, const ExpirationRange& _expiration_range)
+      : column_family_id(_column_family_id),
+        compression(_compression),
+        has_ttl(_has_ttl),
+        expiration_range(_expiration_range) {}
+
+  uint32_t version = kVersion1;
+  uint32_t column_family_id = 0;
+  CompressionType compression = kNoCompression;
+  bool has_ttl = false;
+  ExpirationRange expiration_range;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+//    +--------------+------------+-------------------+------------+
+//    | magic number | blob count | expiration range  | footer CRC |
+//    +--------------+------------+-------------------+------------+
+//    |   Fixed32    |  Fixed64   | Fixed64 + Fixed64 |  Fixed32   |
+//    +--------------+------------+-------------------+------------+
+//
+// The footer is present only when the blob file was properly closed.
+//
+// Unlike the same field in the file header, the expiration range in the
+// footer is the range of the smallest and largest expiration of the data in
+// this file.
+
+// clang-format on
+
+struct BlobLogFooter {
+  static constexpr size_t kSize = 32;
+
+  uint64_t blob_count = 0;
+  ExpirationRange expiration_range = std::make_pair(0, 0);
+  uint32_t crc = 0;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    | key length | value length | expiration | header CRC | blob CRC |   key   |   value   |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    |  Fixed64   |   Fixed64    |  Fixed64   |  Fixed32   | Fixed32  | key len | value len |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If the file has has_ttl = false, the expiration field is always 0, and the
+// blob doesn't have an expiration.
+//
+// Also note that if compression is used, the value is the compressed value,
+// and the value length is the compressed value length.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// would make the reader more complicated.
+
+// clang-format on
+
+struct BlobLogRecord {
+  // The header includes the fields up to and including the blob CRC
+  static constexpr size_t kHeaderSize = 32;
+
+  // Note that the offset field of BlobIndex actually points to the blob value
+  // as opposed to the start of the blob record. The following method can
+  // be used to calculate the adjustment needed to read the blob record header.
+  static constexpr uint64_t CalculateAdjustmentForRecordHeader(
+      uint64_t key_size) {
+    return key_size + kHeaderSize;
+  }
+
+  uint64_t key_size = 0;
+  uint64_t value_size = 0;
+  uint64_t expiration = 0;
+  uint32_t header_crc = 0;
+  uint32_t blob_crc = 0;
+  Slice key;
+  Slice value;
+  std::unique_ptr<char[]> key_buf;
+  std::unique_ptr<char[]> value_buf;
+
+  uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
+
+  void EncodeHeaderTo(std::string* dst);
+
+  Status DecodeHeaderFrom(Slice src);
+
+  Status CheckBlobCRC() const;
+};
+
+// Checks whether a blob offset is potentially valid or not.
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size, + uint64_t value_size, uint64_t file_size) { + if (value_offset < + BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) { + return false; + } + + if (value_offset + value_size + BlobLogFooter::kSize > file_size) { + return false; + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc new file mode 100644 index 000000000..778725189 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_sequential_reader.h" + +#include "file/random_access_file_reader.h" +#include "monitoring/statistics.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogSequentialReader::BlobLogSequentialReader( + std::unique_ptr&& file_reader, SystemClock* clock, + Statistics* statistics) + : file_(std::move(file_reader)), + clock_(clock), + statistics_(statistics), + next_byte_(0) {} + +BlobLogSequentialReader::~BlobLogSequentialReader() = default; + +Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, + char* buf) { + assert(slice); + assert(file_); + + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?) + Status s = + file_->Read(IOOptions(), next_byte_, static_cast(size), slice, + buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + next_byte_ += size; + if (!s.ok()) { + return s; + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size()); + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; +} + +Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) { + assert(header); + assert(next_byte_ == 0); + + static_assert(BlobLogHeader::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogHeader::kSize"); + + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); + } + + return header->DecodeFrom(buffer_); +} + +Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record, + ReadLevel level, + uint64_t* blob_offset) { + assert(record); + static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogRecord::kHeaderSize"); + + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption("EOF reached before record header"); + } + + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; + } + + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } + + switch (level) { + case kReadHeader: + next_byte_ += kb_size; + break; + + case kReadHeaderKey: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + next_byte_ += record->value_size; + break; + + case kReadHeaderKeyBlob: + 
+      record->key_buf.reset(new char[record->key_size]);
+      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+      if (s.ok()) {
+        record->value_buf.reset(new char[record->value_size]);
+        s = ReadSlice(record->value_size, &record->value,
+                      record->value_buf.get());
+      }
+      if (s.ok()) {
+        s = record->CheckBlobCRC();
+      }
+      break;
+  }
+  return s;
+}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+  assert(footer);
+  static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+                "Buffer is smaller than BlobLogFooter::kSize");
+
+  Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (buffer_.size() != BlobLogFooter::kSize) {
+    return Status::Corruption("EOF reached before file footer");
+  }
+
+  return footer->DecodeFrom(buffer_);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 000000000..98afa8518
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
+class BlobLogSequentialReader {
+ public:
+  enum ReadLevel {
+    kReadHeader,
+    kReadHeaderKey,
+    kReadHeaderKeyBlob,
+  };
+
+  // Create a reader that will return log records from "*file_reader".
+  BlobLogSequentialReader(
+      std::unique_ptr<RandomAccessFileReader>&& file_reader,
+      SystemClock* clock, Statistics* statistics);
+
+  // No copying allowed
+  BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
+  BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;
+
+  ~BlobLogSequentialReader();
+
+  Status ReadHeader(BlobLogHeader* header);
+
+  // Read the next record into *record. Returns OK if read successfully, and
+  // a non-OK status if we hit the end of the input or an error occurs. The
+  // contents filled in *record will only be valid until the next mutating
+  // operation on this reader.
+  // If blob_offset is non-null, return the offset of the blob through it.
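+  //
+  // Illustrative usage (a sketch, not part of the original header): after
+  // ReadHeader(), records can be scanned until only the footer remains:
+  //
+  //   BlobLogRecord record;
+  //   uint64_t blob_offset = 0;
+  //   while (s.ok() &&
+  //          reader.GetNextByte() + BlobLogFooter::kSize < file_size) {
+  //     s = reader.ReadRecord(&record, kReadHeaderKeyBlob, &blob_offset);
+  //   }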
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadFooter(BlobLogFooter* footer); + + void ResetNextByte() { next_byte_ = 0; } + + uint64_t GetNextByte() const { return next_byte_; } + + private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + + const std::unique_ptr file_; + SystemClock* clock_; + + Statistics* statistics_; + + Slice buffer_; + char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize, + BlobLogRecord::kHeaderSize)]; + + // which byte to read next + uint64_t next_byte_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#undef MAX_HEADER_SIZE \ No newline at end of file diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc new file mode 100644 index 000000000..9dbac7f25 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_writer.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_log_writer.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/writable_file_writer.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) + : dest_(std::move(dest)), + clock_(clock), + statistics_(statistics), + log_number_(log_number), + block_offset_(boffset), + use_fsync_(use_fs), + do_flush_(do_flush), + last_elem_type_(kEtNone) {} + +BlobLogWriter::~BlobLogWriter() = default; + +Status BlobLogWriter::Sync() { + TEST_SYNC_POINT("BlobLogWriter::Sync"); + + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + Status s = dest_->Sync(use_fsync_); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + return s; +} + +Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { + assert(block_offset_ == 0); + assert(last_elem_type_ == kEtNone); + std::string str; + header.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + if (do_flush_) { + s = dest_->Flush(); + } + } + last_elem_type_ = kEtFileHdr; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + return s; +} + +Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, + std::string* checksum_method, + std::string* checksum_value) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string str; + footer.EncodeTo(&str); + + Status s; + if (dest_->seen_error()) { + s.PermitUncheckedError(); + return Status::IOError("Seen Error. 
Skip closing."); + } else { + s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + + s = Sync(); + + if (s.ok()) { + s = dest_->Close(); + + if (s.ok()) { + assert(!!checksum_method == !!checksum_value); + + if (checksum_method) { + assert(checksum_method->empty()); + + std::string method = dest_->GetFileChecksumFuncName(); + if (method != kUnknownFileChecksumFuncName) { + *checksum_method = std::move(method); + } + } + if (checksum_value) { + assert(checksum_value->empty()); + + std::string value = dest_->GetFileChecksum(); + if (value != kUnknownFileChecksum) { + *checksum_value = std::move(value); + } + } + } + } + } + + dest_.reset(); + } + + last_elem_type_ = kEtFileFooter; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, expiration); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t* key_offset, uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, 0); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); +} + +Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, + const Slice& key, const Slice& val, + uint64_t* key_offset, + uint64_t* blob_offset) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + Status s = dest_->Append(Slice(headerbuf)); + if (s.ok()) { + s = dest_->Append(key); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (do_flush_ && s.ok()) { + s = dest_->Flush(); + } + + *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; + *blob_offset = *key_offset + key.size(); + block_offset_ = *blob_offset + val.size(); + last_elem_type_ = kEtRecord; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h new file mode 100644 index 000000000..c1f9f31ad --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_writer.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; +class SystemClock; +/** + * BlobLogWriter is the blob log stream writer. It provides an append-only + * abstraction for writing blob data. 
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+
+class BlobLogWriter {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this BlobLogWriter is in use.
+  BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
+                Statistics* statistics, uint64_t log_number, bool use_fsync,
+                bool do_flush, uint64_t boffset = 0);
+  // No copying allowed
+  BlobLogWriter(const BlobLogWriter&) = delete;
+  BlobLogWriter& operator=(const BlobLogWriter&) = delete;
+
+  ~BlobLogWriter();
+
+  static void ConstructBlobHeader(std::string* buf, const Slice& key,
+                                  const Slice& val, uint64_t expiration);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
+                   uint64_t* key_offset, uint64_t* blob_offset);
+
+  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+                            const Slice& val, uint64_t* key_offset,
+                            uint64_t* blob_offset);
+
+  Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
+                      std::string* checksum_value);
+
+  Status WriteHeader(BlobLogHeader& header);
+
+  WritableFileWriter* file() { return dest_.get(); }
+
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  Status Sync();
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  uint64_t log_number_;
+  uint64_t block_offset_;  // Current offset in block
+  bool use_fsync_;
+  bool do_flush_;
+
+ public:
+  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
+  ElemType last_elem_type_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h
new file mode 100644
index 000000000..f9668ca2e
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A read Blob request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
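+// Requests against a single blob file are grouped into a BlobFileReadRequests
+// tuple (defined below), which pairs the file number and file size with the
+// batch of requests destined for that file.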
+struct BlobReadRequest {
+  // User key to look up the paired blob
+  const Slice* user_key = nullptr;
+
+  // File offset in bytes
+  uint64_t offset = 0;
+
+  // Length to read in bytes
+  size_t len = 0;
+
+  // Blob compression type
+  CompressionType compression = kNoCompression;
+
+  // Output parameter set by MultiGetBlob() to point to the data buffer, and
+  // the number of valid bytes
+  PinnableSlice* result = nullptr;
+
+  // Status of read
+  Status* status = nullptr;
+
+  BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
+                  CompressionType _compression, PinnableSlice* _result,
+                  Status* _status)
+      : user_key(&_user_key),
+        offset(_offset),
+        len(_len),
+        compression(_compression),
+        result(_result),
+        status(_status) {}
+
+  BlobReadRequest() = default;
+  BlobReadRequest(const BlobReadRequest& other) = default;
+  BlobReadRequest& operator=(const BlobReadRequest& other) = default;
+};
+
+using BlobFileReadRequests =
+    std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+               autovector<BlobReadRequest>>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc
new file mode 100644
index 000000000..bfade2507
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.cc
@@ -0,0 +1,488 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+                       const std::string& db_id,
+                       const std::string& db_session_id,
+                       BlobFileCache* blob_file_cache)
+    : db_id_(db_id),
+      db_session_id_(db_session_id),
+      statistics_(immutable_options->statistics.get()),
+      blob_file_cache_(blob_file_cache),
+      blob_cache_(immutable_options->blob_cache),
+      lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+#ifndef ROCKSDB_LITE
+  auto bbto =
+      immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+  if (bbto &&
+      bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
+              .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+    blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
+                                                 bbto->block_cache);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+    const Slice& cache_key,
+    CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  Cache::Handle* cache_handle = nullptr;
+  cache_handle = GetEntryFromCache(cache_key);
+  if (cache_handle != nullptr) {
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+    RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+               cached_blob->GetValue()->size());
+
+    return Status::OK();
+  }
+
+  RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+  return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+    const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+    CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(blob);
+  assert(*blob);
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  Cache::Handle* cache_handle = nullptr;
+  const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+                                        (*blob)->ApproximateMemoryUsage(),
+                                        &cache_handle, Cache::Priority::BOTTOM);
+  if (s.ok()) {
+    blob->release();
+
+    assert(cache_handle != nullptr);
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+               cached_blob->GetValue()->size());
+
+  } else {
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+  }
+
+  return s;
+}
+
+Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
+  Cache::Handle* cache_handle = nullptr;
+
+  if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+    Cache::CreateCallback create_cb =
+        [allocator = blob_cache_->memory_allocator()](
+            const void* buf, size_t size, void** out_obj,
+            size_t* charge) -> Status {
+      return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
+                                          size, out_obj, charge);
+    };
+
+    cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
+                                       create_cb, Cache::Priority::BOTTOM,
+                                       true /* wait_for_cache */, statistics_);
+  } else {
+    cache_handle = blob_cache_->Lookup(key, statistics_);
+  }
+
+  return cache_handle;
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+                               PinnableSlice* value) {
+  assert(cached_blob);
+  assert(cached_blob->GetValue());
+  assert(value);
+
+  // To avoid copying the cached blob into the buffer provided by the
+  // application, we can simply transfer ownership of the cache handle to
+  // the target PinnableSlice. This has the potential to save a lot of
+  // CPU, especially with large blob values.
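+  // In other words, the cache handle's lifetime becomes tied to the
+  // PinnableSlice: the handle is released once the slice is reset or
+  // destroyed, so the blob stays pinned in the cache exactly as long as the
+  // caller holds the slice.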
+ + value->Reset(); + + constexpr Cleanable* cleanable = nullptr; + value->PinSlice(cached_blob->GetValue()->data(), cleanable); + + cached_blob->TransferTo(value); +} + +void BlobSource::PinOwnedBlob(std::unique_ptr* owned_blob, + PinnableSlice* value) { + assert(owned_blob); + assert(*owned_blob); + assert(value); + + BlobContents* const blob = owned_blob->release(); + assert(blob); + + value->Reset(); + value->PinSlice( + blob->data(), + [](void* arg1, void* /* arg2 */) { + delete static_cast(arg1); + }, + blob, nullptr); +} + +Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value, + size_t charge, + Cache::Handle** cache_handle, + Cache::Priority priority) const { + Status s; + + Cache::CacheItemHelper* const cache_item_helper = + BlobContents::GetCacheItemHelper(); + assert(cache_item_helper); + + if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) { + s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle, + priority); + } else { + s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb, + cache_handle, priority); + } + + return s; +} + +Status BlobSource::GetBlob(const ReadOptions& read_options, + const Slice& user_key, uint64_t file_number, + uint64_t offset, uint64_t file_size, + uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) { + assert(value); + + Status s; + + const CacheKey cache_key = GetCacheKey(file_number, file_size, offset); + + CacheHandleGuard blob_handle; + + // First, try to get the blob from the cache + // + // If blob cache is enabled, we'll try to read from it. + if (blob_cache_) { + Slice key = cache_key.AsSlice(); + s = GetBlobFromCache(key, &blob_handle); + if (s.ok()) { + PinCachedBlob(&blob_handle, value); + + // For consistency, the size of on-disk (possibly compressed) blob record + // is assigned to bytes_read. + uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader( + user_key.size()) + : 0; + assert(offset >= adjustment); + + uint64_t record_size = value_size + adjustment; + if (bytes_read) { + *bytes_read = record_size; + } + return s; + } + } + + assert(blob_handle.IsEmpty()); + + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (no_io) { + s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + return s; + } + + // Can't find the blob from the cache. Since I/O is allowed, read from the + // file. + std::unique_ptr blob_contents; + + { + CacheHandleGuard blob_file_reader; + s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + if (!s.ok()) { + return s; + } + + assert(blob_file_reader.GetValue()); + + if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache) + ? blob_cache_->memory_allocator() + : nullptr; + + uint64_t read_size = 0; + s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, offset, value_size, compression_type, + prefetch_buffer, allocator, &blob_contents, &read_size); + if (!s.ok()) { + return s; + } + if (bytes_read) { + *bytes_read = read_size; + } + } + + if (blob_cache_ && read_options.fill_cache) { + // If filling cache is allowed and a cache is configured, try to put the + // blob to the cache. 
+ Slice key = cache_key.AsSlice(); + s = PutBlobIntoCache(key, &blob_contents, &blob_handle); + if (!s.ok()) { + return s; + } + + PinCachedBlob(&blob_handle, value); + } else { + PinOwnedBlob(&blob_contents, value); + } + + assert(s.ok()); + return s; +} + +void BlobSource::MultiGetBlob(const ReadOptions& read_options, + autovector& blob_reqs, + uint64_t* bytes_read) { + assert(blob_reqs.size() > 0); + + uint64_t total_bytes_read = 0; + uint64_t bytes_read_in_file = 0; + + for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) { + // sort blob_reqs_in_file by file offset. + std::sort( + blob_reqs_in_file.begin(), blob_reqs_in_file.end(), + [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool { + return lhs.offset < rhs.offset; + }); + + MultiGetBlobFromOneFile(read_options, file_number, file_size, + blob_reqs_in_file, &bytes_read_in_file); + + total_bytes_read += bytes_read_in_file; + } + + if (bytes_read) { + *bytes_read = total_bytes_read; + } +} + +void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, + uint64_t file_number, + uint64_t /*file_size*/, + autovector& blob_reqs, + uint64_t* bytes_read) { + const size_t num_blobs = blob_reqs.size(); + assert(num_blobs > 0); + assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE); + +#ifndef NDEBUG + for (size_t i = 0; i < num_blobs - 1; ++i) { + assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset); + } +#endif // !NDEBUG + + using Mask = uint64_t; + Mask cache_hit_mask = 0; + + uint64_t total_bytes = 0; + const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + + if (blob_cache_) { + size_t cached_blob_count = 0; + for (size_t i = 0; i < num_blobs; ++i) { + auto& req = blob_reqs[i]; + + CacheHandleGuard blob_handle; + const CacheKey cache_key = base_cache_key.WithOffset(req.offset); + const Slice key = cache_key.AsSlice(); + + const Status s = GetBlobFromCache(key, &blob_handle); + + if (s.ok()) { + assert(req.status); + *req.status = s; + + PinCachedBlob(&blob_handle, req.result); + + // Update the counter for the number of valid blobs read from the cache. + ++cached_blob_count; + + // For consistency, the size of each on-disk (possibly compressed) blob + // record is accumulated to total_bytes. + uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader( + req.user_key->size()) + : 0; + assert(req.offset >= adjustment); + total_bytes += req.len + adjustment; + cache_hit_mask |= (Mask{1} << i); // cache hit + } + } + + // All blobs were read from the cache. + if (cached_blob_count == num_blobs) { + if (bytes_read) { + *bytes_read = total_bytes; + } + return; + } + } + + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (no_io) { + for (size_t i = 0; i < num_blobs; ++i) { + if (!(cache_hit_mask & (Mask{1} << i))) { + BlobReadRequest& req = blob_reqs[i]; + assert(req.status); + + *req.status = + Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + } + } + return; + } + + { + // Find the rest of blobs from the file since I/O is allowed. 
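+    // Each outstanding (i.e. not-yet-satisfied) request is paired below with
+    // a unique_ptr slot that receives the blob contents read from the file,
+    // so each result can then either be inserted into the cache or handed
+    // directly to the caller.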
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>,
+               MultiGetContext::MAX_BATCH_SIZE>
+        _blob_reqs;
+    uint64_t _bytes_read = 0;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (!(cache_hit_mask & (Mask{1} << i))) {
+        _blob_reqs.emplace_back(&blob_reqs[i],
+                                std::unique_ptr<BlobContents>());
+      }
+    }
+
+    CacheHandleGuard<BlobFileReader> blob_file_reader;
+    Status s =
+        blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+    if (!s.ok()) {
+      for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+        BlobReadRequest* const req = _blob_reqs[i].first;
+        assert(req);
+        assert(req->status);
+
+        *req->status = s;
+      }
+      return;
+    }
+
+    assert(blob_file_reader.GetValue());
+
+    MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+                                           ? blob_cache_->memory_allocator()
+                                           : nullptr;
+
+    blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+                                              _blob_reqs, &_bytes_read);
+
+    if (blob_cache_ && read_options.fill_cache) {
+      // If filling the cache is allowed and a cache is configured, try to put
+      // the blob(s) into the cache.
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          CacheHandleGuard<BlobContents> blob_handle;
+          const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+          const Slice key = cache_key.AsSlice();
+          s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+          if (!s.ok()) {
+            *req->status = s;
+          } else {
+            PinCachedBlob(&blob_handle, req->result);
+          }
+        }
+      }
+    } else {
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          PinOwnedBlob(&blob_contents, req->result);
+        }
+      }
+    }
+
+    total_bytes += _bytes_read;
+    if (bytes_read) {
+      *bytes_read = total_bytes;
+    }
+  }
+}
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+                                  uint64_t offset, size_t* charge) const {
+  const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+  const Slice key = cache_key.AsSlice();
+
+  CacheHandleGuard<BlobContents> blob_handle;
+  const Status s = GetBlobFromCache(key, &blob_handle);
+
+  if (s.ok() && blob_handle.GetValue() != nullptr) {
+    if (charge) {
+      const Cache* const cache = blob_handle.GetCache();
+      assert(cache);
+
+      Cache::Handle* const handle = blob_handle.GetCacheHandle();
+      assert(handle);
+
+      *charge = cache->GetUsage(handle);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h
new file mode 100644
index 000000000..2ed296eeb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.h
@@ -0,0 +1,153 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+class BlobContents;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it always fetches blobs from the multi-tier
+// cache and storage with minimal cost.
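+//
+// On a read, BlobSource first consults the blob cache (when one is
+// configured); on a miss it obtains a BlobFileReader through the
+// BlobFileCache, reads the blob from storage, and, if ReadOptions::fill_cache
+// is set, inserts the result back into the cache.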
+class BlobSource { + public: + BlobSource(const ImmutableOptions* immutable_options, + const std::string& db_id, const std::string& db_session_id, + BlobFileCache* blob_file_cache); + + BlobSource(const BlobSource&) = delete; + BlobSource& operator=(const BlobSource&) = delete; + + ~BlobSource(); + + // Read a blob from the underlying cache or one blob file. + // + // If successful, returns ok and sets "*value" to the newly retrieved + // uncompressed blob. If there was an error while fetching the blob, sets + // "*value" to empty and returns a non-ok status. + // + // Note: For consistency, whether the blob is found in the cache or on disk, + // sets "*bytes_read" to the size of on-disk (possibly compressed) blob + // record. + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t file_number, uint64_t offset, uint64_t file_size, + uint64_t value_size, CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read); + + // Read multiple blobs from the underlying cache or blob file(s). + // + // If successful, returns ok and sets "result" in the elements of "blob_reqs" + // to the newly retrieved uncompressed blobs. If there was an error while + // fetching one of blobs, sets its "result" to empty and sets its + // corresponding "status" to a non-ok status. + // + // Note: + // - The main difference between this function and MultiGetBlobFromOneFile is + // that this function can read multiple blobs from multiple blob files. + // + // - For consistency, whether the blob is found in the cache or on disk, sets + // "*bytes_read" to the total size of on-disk (possibly compressed) blob + // records. + void MultiGetBlob(const ReadOptions& read_options, + autovector& blob_reqs, + uint64_t* bytes_read); + + // Read multiple blobs from the underlying cache or one blob file. + // + // If successful, returns ok and sets "result" in the elements of "blob_reqs" + // to the newly retrieved uncompressed blobs. If there was an error while + // fetching one of blobs, sets its "result" to empty and sets its + // corresponding "status" to a non-ok status. + // + // Note: + // - The main difference between this function and MultiGetBlob is that this + // function is only used for the case where the demanded blobs are stored in + // one blob file. MultiGetBlob will call this function multiple times if the + // demanded blobs are stored in multiple blob files. + // + // - For consistency, whether the blob is found in the cache or on disk, sets + // "*bytes_read" to the total size of on-disk (possibly compressed) blob + // records. 
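+  //
+  // An illustrative call (a sketch; the local names are hypothetical):
+  //
+  //   autovector<BlobReadRequest> reqs;
+  //   reqs.emplace_back(user_key, offset, len, kNoCompression, &result,
+  //                     &status);
+  //   blob_source.MultiGetBlobFromOneFile(read_options, file_number,
+  //                                       file_size, reqs, &bytes_read);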
+ void MultiGetBlobFromOneFile(const ReadOptions& read_options, + uint64_t file_number, uint64_t file_size, + autovector& blob_reqs, + uint64_t* bytes_read); + + inline Status GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + return blob_file_cache_->GetBlobFileReader(blob_file_number, + blob_file_reader); + } + + inline Cache* GetBlobCache() const { return blob_cache_.get(); } + + bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size, + uint64_t offset, size_t* charge = nullptr) const; + + private: + Status GetBlobFromCache(const Slice& cache_key, + CacheHandleGuard* cached_blob) const; + + Status PutBlobIntoCache(const Slice& cache_key, + std::unique_ptr* blob, + CacheHandleGuard* cached_blob) const; + + static void PinCachedBlob(CacheHandleGuard* cached_blob, + PinnableSlice* value); + + static void PinOwnedBlob(std::unique_ptr* owned_blob, + PinnableSlice* value); + + Cache::Handle* GetEntryFromCache(const Slice& key) const; + + Status InsertEntryIntoCache(const Slice& key, BlobContents* value, + size_t charge, Cache::Handle** cache_handle, + Cache::Priority priority) const; + + inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/, + uint64_t offset) const { + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + return base_cache_key.WithOffset(offset); + } + + const std::string& db_id_; + const std::string& db_session_id_; + + Statistics* statistics_; + + // A cache to store blob file reader. + BlobFileCache* blob_file_cache_; + + // A cache to store uncompressed blobs. + std::shared_ptr blob_cache_; + + // The control option of how the cache tiers will be used. Currently rocksdb + // support block/blob cache (volatile tier) and secondary cache (this tier + // isn't strictly speaking a non-volatile tier since the compressed cache in + // this tier is in volatile memory). + const CacheTier lowest_used_cache_tier_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc new file mode 100644 index 000000000..a85ed8646 --- /dev/null +++ b/src/rocksdb/db/blob/blob_source_test.cc @@ -0,0 +1,1624 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_source.h" + +#include +#include +#include +#include +#include + +#include "cache/charged_cache.h" +#include "cache/compressed_secondary_cache.h" +#include "db/blob/blob_contents.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/db_test_util.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "util/compression.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with `num` blobs in it. 
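+// The resulting file is laid out as BlobLogHeader::kSize bytes of header, one
+// BlobLogRecord (record header + key + value) per blob, and
+// BlobLogFooter::kSize bytes of footer; the tests below compute expected file
+// sizes from the same layout.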
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector& keys, + const std::vector& blobs, CompressionType compression, + std::vector& blob_offsets, + std::vector& blob_sizes) { + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + std::vector compressed_blobs(num); + std::vector blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { + ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] = compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobSourceTest : public DBTestBase { + protected: + public: + explicit BlobSourceTest() + : DBTestBase("blob_source_test", /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + LRUCacheOptions co; + co.capacity = 8 << 20; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + co.high_pri_pool_ratio = 0.2; + co.low_pri_pool_ratio = 0.2; + options_.blob_cache = NewLRUCache(co); + options_.lowest_used_cache_tier = CacheTier::kVolatileTier; + + assert(db_->GetDbIdentity(db_id_).ok()); + assert(db_->GetDbSessionId(db_session_id_).ok()); + } + + Options options_; + std::string db_id_; + std::string db_session_id_; +}; + +TEST_F(BlobSourceTest, GetBlobsFromCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + 
DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 16; + + std::vector key_strs; + std::vector blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector keys; + std::vector blobs; + + uint64_t file_size = BlobLogHeader::kSize; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 1024; + std::shared_ptr backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr blob_file_cache = + std::make_unique( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + // GetBlob + std::vector values(keys.size()); + uint64_t bytes_read = 0; + uint64_t blob_bytes = 0; + uint64_t total_bytes = 0; + + read_options.fill_cache = false; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
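+    // With fill_cache = false nothing is ever inserted, so all three lookups
+    // per blob miss, giving num_blobs * 3 misses and zero hits.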
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + + read_options.fill_cache = true; + blob_bytes = 0; + total_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + blob_bytes += blob_sizes[i]; + total_bytes += bytes_read; + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i); + ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + blob_bytes); + + read_options.fill_cache = true; + total_bytes = 0; + blob_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; // on-disk blob record size + blob_bytes += blob_sizes[i]; // cached blob value size + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + + // Cache-only GetBlob + read_options.read_tier = ReadTier::kBlockCacheTier; + total_bytes = 0; + blob_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; + blob_bytes += blob_sizes[i]; + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + options_.blob_cache->EraseUnRefEntries(); + + { + // Cache-only GetBlob + std::vector values(keys.size()); + uint64_t bytes_read = 0; + + read_options.read_tier = ReadTier::kBlockCacheTier; + read_options.fill_cache = true; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_TRUE(blob_source + .GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read) + .IsIncomplete()); + ASSERT_TRUE(values[i].empty()); + ASSERT_FALSE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, 0); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + { + // GetBlob from non-existing file + std::vector values(keys.size()); + uint64_t bytes_read = 0; + uint64_t file_number = 100; // non-existing file + + read_options.read_tier = ReadTier::kReadAllTier; + read_options.fill_cache = true; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + + ASSERT_TRUE(blob_source + .GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read) + .IsIOError()); + ASSERT_TRUE(values[i].empty()); + ASSERT_FALSE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, 0); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +TEST_F(BlobSourceTest, GetCompressedBlobs) { + if (!Snappy_Supported()) { + return; + } + + const CompressionType compression = kSnappyCompression; + + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr size_t num_blobs = 256; + + std::vector key_strs; + std::vector blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector keys; + std::vector blobs; + + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + } + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + constexpr size_t capacity = 1024; + auto backing_cache = NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + std::unique_ptr blob_file_cache = + std::make_unique( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + uint64_t bytes_read = 0; + std::vector values(keys.size()); + + { + // Snappy Compression + const uint64_t 
file_number = 1; + + read_options.read_tier = ReadTier::kReadAllTier; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range, expiration_range, file_number, keys, blobs, + compression, blob_offsets, blob_sizes); + + CacheHandleGuard blob_file_reader; + ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader)); + ASSERT_NE(blob_file_reader.GetValue(), nullptr); + + const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); + ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_NE(blobs[i].size() /*uncompressed size*/, + blob_sizes[i] /*compressed size*/); + } + + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + compression, nullptr /*prefetch_buffer*/, + &values[i], &bytes_read)); + ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/); + ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0); + + read_options.read_tier = ReadTier::kBlockCacheTier; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + + // Compressed blob size is passed in GetBlob + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + compression, nullptr /*prefetch_buffer*/, + &values[i], &bytes_read)); + ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/); + ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + } +} + +TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"), + 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_files = 2; + constexpr size_t num_blobs = 32; + + std::vector key_strs; + std::vector blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector keys; + std::vector blobs; + + uint64_t file_size = BlobLogHeader::kSize; + uint64_t blob_value_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + blob_value_bytes += blobs[i].size(); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + const uint64_t blob_records_bytes = + file_size - 
BlobLogHeader::kSize - BlobLogFooter::kSize; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + { + // Write key/blob pairs to multiple blob files. + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range, expiration_range, file_number, keys, + blobs, kNoCompression, blob_offsets, blob_sizes); + } + } + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr blob_file_cache = + std::make_unique( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + uint64_t bytes_read = 0; + + { + // MultiGetBlob + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + + autovector blob_reqs; + std::array, blob_files> blob_reqs_in_file; + std::array value_buf; + std::array statuses_buf; + + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + blob_reqs_in_file[i].emplace_back( + keys[j], blob_offsets[j], blob_sizes[j], kNoCompression, + &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]); + } + blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read); + + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + ASSERT_OK(statuses_buf[i * num_blobs + j]); + ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]); + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[j])); + } + } + + // Retrieved all blobs from 2 blob files twice via MultiGetBlob and + // TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, + num_blobs * blob_files); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + num_blobs * blob_files); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + blob_records_bytes * blob_files); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), + num_blobs * blob_files); // MultiGetBlob + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), + num_blobs * blob_files); // TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), + num_blobs * blob_files); // MultiGetBlob + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_value_bytes * blob_files); // TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + blob_value_bytes * blob_files); // MultiGetBlob + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + autovector fake_blob_reqs_in_file; + std::array fake_value_buf; + std::array fake_statuses_buf; + + const uint64_t fake_file_number = 100; + for (size_t i = 0; i < num_blobs; ++i) { + fake_blob_reqs_in_file.emplace_back( + keys[i], blob_offsets[i], blob_sizes[i], kNoCompression, + &fake_value_buf[i], &fake_statuses_buf[i]); + } + + // Add a fake multi-get blob request. + blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file); + + blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read); + + // Check the real blob read requests. + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + ASSERT_OK(statuses_buf[i * num_blobs + j]); + ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]); + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[j])); + } + } + + // Check the fake blob request. + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(fake_statuses_buf[i].IsIOError()); + ASSERT_TRUE(fake_value_buf[i].empty()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size, + blob_offsets[i])); + } + + // Retrieved all blobs from 3 blob files (including the fake one) twice + // via MultiGetBlob and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, + num_blobs * blob_files * 2); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + 0); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + // Fake blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + // Real blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), + num_blobs * blob_files * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + // Real blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_value_bytes * blob_files * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +TEST_F(BlobSourceTest, MultiGetBlobsFromCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 16; + + std::vector key_strs; + std::vector blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector keys; + std::vector blobs; + + uint64_t file_size = BlobLogHeader::kSize; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr blob_file_cache = + std::make_unique( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + // MultiGetBlobFromOneFile + uint64_t bytes_read = 0; + std::array statuses_buf; + std::array value_buf; + autovector blob_reqs; + + for (size_t i = 0; i < num_blobs; i += 2) { // even index + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + // Get half of blobs + 
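+    // Only the even-indexed blobs were queued above, so the multi-get below
+    // should read num_blobs / 2 records from the blob file and leave the
+    // odd-indexed slots untouched (empty, unpinned values).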
blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + uint64_t fs_read_bytes = 0; + uint64_t ca_read_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + if (i % 2 == 0) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + fs_read_bytes += + blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize; + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + ca_read_bytes += blob_sizes[i]; + } else { + statuses_buf[i].PermitUncheckedError(); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + } + + constexpr int num_even_blobs = num_blobs / 2; + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + num_even_blobs); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + fs_read_bytes); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + ca_read_bytes); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + ca_read_bytes); + + // Get the rest of blobs + for (size_t i = 1; i < num_blobs; i += 2) { // odd index + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, + &value_buf[i], &bytes_read)); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + // Cache-only MultiGetBlobFromOneFile + read_options.read_tier = ReadTier::kBlockCacheTier; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_reqs.clear(); + for (size_t i = 0; i < num_blobs; ++i) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + } + + blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + uint64_t blob_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + blob_bytes += blob_sizes[i]; + } + + // Retrieved the blob cache num_blobs * 2 times via GetBlob and + // TEST_BlobInCache. 
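+    // (The perf context and statistics were reset after the GetBlob loop, so
+    // only the cache-only multi-get and the TEST_BlobInCache sweep above are
+    // reflected in the num_blobs * 2 totals below.)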
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + options_.blob_cache->EraseUnRefEntries(); + + { + // Cache-only MultiGetBlobFromOneFile + uint64_t bytes_read = 0; + read_options.read_tier = ReadTier::kBlockCacheTier; + + std::array statuses_buf; + std::array value_buf; + autovector blob_reqs; + + for (size_t i = 0; i < num_blobs; i++) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(statuses_buf[i].IsIncomplete()); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + { + // MultiGetBlobFromOneFile from non-existing file + uint64_t bytes_read = 0; + uint64_t non_existing_file_number = 100; + read_options.read_tier = ReadTier::kReadAllTier; + + std::array statuses_buf; + std::array value_buf; + autovector blob_reqs; + + for (size_t i = 0; i < num_blobs; i++) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number, + file_size, blob_offsets[i])); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number, + file_size, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(statuses_buf[i].IsIOError()); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number, + file_size, blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); 
// blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +class BlobSecondaryCacheTest : public DBTestBase { + protected: + public: + explicit BlobSecondaryCacheTest() + : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + // Set a small cache capacity to evict entries from the cache, and to test + // that secondary cache is used properly. + lru_cache_opts_.capacity = 1024; + lru_cache_opts_.num_shard_bits = 0; + lru_cache_opts_.strict_capacity_limit = true; + lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata; + lru_cache_opts_.high_pri_pool_ratio = 0.2; + lru_cache_opts_.low_pri_pool_ratio = 0.2; + + secondary_cache_opts_.capacity = 8 << 20; // 8 MB + secondary_cache_opts_.num_shard_bits = 0; + secondary_cache_opts_.metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + // Read blobs from the secondary cache if they are not in the primary cache + options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + + assert(db_->GetDbIdentity(db_id_).ok()); + assert(db_->GetDbSessionId(db_session_id_).ok()); + } + + Options options_; + + LRUCacheOptions lru_cache_opts_; + CompressedSecondaryCacheOptions secondary_cache_opts_; + + std::string db_id_; + std::string db_session_id_; +}; + +TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { + if (!Snappy_Supported()) { + return; + } + + secondary_cache_opts_.compression_type = kSnappyCompression; + lru_cache_opts_.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts_); + options_.blob_cache = NewLRUCache(lru_cache_opts_); + + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_, "BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"), + 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t file_number = 1; + + Random rnd(301); + + std::vector key_strs{"key0", "key1"}; + std::vector blob_strs{rnd.RandomString(512), + rnd.RandomString(768)}; + + std::vector keys{key_strs[0], key_strs[1]}; + std::vector blobs{blob_strs[0], blob_strs[1]}; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 1024; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr blob_file_cache(new BlobFileCache( + backing_cache.get(), &immutable_options, &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/)); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + 
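+  // The primary blob cache was configured in the fixture with a strict
+  // 1 KiB capacity, so it can hold only one of the two blobs (512 and 768
+  // bytes) at a time. Each insertion below is therefore expected to evict
+  // the other key, demoting the evicted entry to the compressed secondary
+  // cache; the assertions that follow trace exactly this ping-pong.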
CacheHandleGuard file_reader; + ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader)); + ASSERT_NE(file_reader.GetValue(), nullptr); + const uint64_t file_size = file_reader.GetValue()->GetFileSize(); + ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression); + + ReadOptions read_options; + read_options.verify_checksums = true; + + auto blob_cache = options_.blob_cache; + auto secondary_cache = lru_cache_opts_.secondary_cache; + + Cache::CreateCallback create_cb = [](const void* buf, size_t size, + void** out_obj, + size_t* charge) -> Status { + CacheAllocationPtr allocation(new char[size]); + + return BlobContents::CreateCallback(std::move(allocation), buf, size, + out_obj, charge); + }; + + { + // GetBlob + std::vector values(keys.size()); + + read_options.fill_cache = true; + get_perf_context()->Reset(); + + // key0 should be filled to the primary cache from the blob file. + ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, + blob_offsets[0], file_size, blob_sizes[0], + kNoCompression, nullptr /* prefetch_buffer */, + &values[0], nullptr /* bytes_read */)); + // Release cache handle + values[0].Reset(); + + // key0 should be evicted and key0's dummy item is inserted into secondary + // cache. key1 should be filled to the primary cache from the blob file. + ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number, + blob_offsets[1], file_size, blob_sizes[1], + kNoCompression, nullptr /* prefetch_buffer */, + &values[1], nullptr /* bytes_read */)); + + // Release cache handle + values[1].Reset(); + + // key0 should be filled to the primary cache from the blob file. key1 + // should be evicted and key1's dummy item is inserted into secondary cache. + ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, + blob_offsets[0], file_size, blob_sizes[0], + kNoCompression, nullptr /* prefetch_buffer */, + &values[0], nullptr /* bytes_read */)); + ASSERT_EQ(values[0], blobs[0]); + ASSERT_TRUE( + blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0])); + + // Release cache handle + values[0].Reset(); + + // key0 should be evicted and is inserted into secondary cache. + // key1 should be filled to the primary cache from the blob file. + ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number, + blob_offsets[1], file_size, blob_sizes[1], + kNoCompression, nullptr /* prefetch_buffer */, + &values[1], nullptr /* bytes_read */)); + ASSERT_EQ(values[1], blobs[1]); + ASSERT_TRUE( + blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1])); + + // Release cache handle + values[1].Reset(); + + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + + // blob_cache here only looks at the primary cache since we didn't provide + // the cache item helper for the secondary cache. However, since key0 is + // demoted to the secondary cache, we shouldn't be able to find it in the + // primary cache. + { + CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]); + const Slice key0 = cache_key.AsSlice(); + auto handle0 = blob_cache->Lookup(key0, statistics); + ASSERT_EQ(handle0, nullptr); + + // key0's item should be in the secondary cache. 
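+      // A successful secondary-cache Lookup() materializes the stored
+      // payload through create_cb (which allocates a fresh BlobContents),
+      // and advise_erase=true tells the secondary cache it may drop its copy
+      // once the caller takes ownership; hence the explicit delete below.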
+      bool is_in_sec_cache = false;
+      auto sec_handle0 =
+          secondary_cache->Lookup(key0, create_cb, /*wait=*/true,
+                                  /*advise_erase=*/true, is_in_sec_cache);
+      ASSERT_FALSE(is_in_sec_cache);
+      ASSERT_NE(sec_handle0, nullptr);
+      ASSERT_TRUE(sec_handle0->IsReady());
+      auto value = static_cast<BlobContents*>(sec_handle0->Value());
+      ASSERT_NE(value, nullptr);
+      ASSERT_EQ(value->data(), blobs[0]);
+      delete value;
+
+      // key0 doesn't exist in the blob cache although key0's dummy
+      // item exists in the secondary cache.
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                blob_offsets[0]));
+    }
+
+    // key1 should exist in the primary cache. key1's dummy item exists
+    // in the secondary cache.
+    {
+      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
+      const Slice key1 = cache_key.AsSlice();
+      auto handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_NE(handle1, nullptr);
+      blob_cache->Release(handle1);
+
+      bool is_in_sec_cache = false;
+      auto sec_handle1 =
+          secondary_cache->Lookup(key1, create_cb, /*wait=*/true,
+                                  /*advise_erase=*/true, is_in_sec_cache);
+      ASSERT_FALSE(is_in_sec_cache);
+      ASSERT_EQ(sec_handle1, nullptr);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[1]));
+    }
+
+    {
+      // Fetch key0 from the blob file into the primary cache.
+      // key1 is evicted and inserted into the secondary cache.
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys[0], file_number, blob_offsets[0], file_size,
+          blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */,
+          &values[0], nullptr /* bytes_read */));
+      ASSERT_EQ(values[0], blobs[0]);
+
+      // Release cache handle
+      values[0].Reset();
+
+      // key0 should be in the primary cache.
+      CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
+      const Slice key0 = cache_key0.AsSlice();
+      auto handle0 = blob_cache->Lookup(key0, statistics);
+      ASSERT_NE(handle0, nullptr);
+      auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
+      ASSERT_NE(value, nullptr);
+      ASSERT_EQ(value->data(), blobs[0]);
+      blob_cache->Release(handle0);
+
+      // key1 is not in the primary cache but is in the secondary cache.
+      CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
+      const Slice key1 = cache_key1.AsSlice();
+      auto handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_EQ(handle1, nullptr);
+
+      // Erase key0 from the primary cache.
+      blob_cache->Erase(key0);
+      handle0 = blob_cache->Lookup(key0, statistics);
+      ASSERT_EQ(handle0, nullptr);
+
+      // key1's promotion should succeed because the primary cache is now
+      // empty. We didn't call the secondary cache's Lookup() here, because it
+      // would remove the key without being able to promote it to the primary
+      // cache. Instead, we use the end-to-end blob source API to read key1.
+      // In TEST_BlobInCache, key1's dummy item is inserted into the primary
+      // cache and a standalone handle is checked by GetValue().
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[1]));
+
+      // key1's dummy handle is in the primary cache and key1's item is still
+      // in the secondary cache. So the primary cache's Lookup(), without
+      // secondary cache support, cannot see it. (NOTE: the dummy handle used
+      // to be a leaky abstraction but is not anymore.)
+ handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_EQ(handle1, nullptr); + + // But after another access, it is promoted to primary cache + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[1])); + + // And Lookup() can find it (without secondary cache support) + handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_NE(handle1, nullptr); + ASSERT_NE(blob_cache->Value(handle1), nullptr); + blob_cache->Release(handle1); + } + } +} + +class BlobSourceCacheReservationTest : public DBTestBase { + public: + explicit BlobSourceCacheReservationTest() + : DBTestBase("blob_source_cache_reservation_test", + /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + LRUCacheOptions co; + co.capacity = kCacheCapacity; + co.num_shard_bits = kNumShardBits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + + co.high_pri_pool_ratio = 0.0; + co.low_pri_pool_ratio = 0.0; + std::shared_ptr blob_cache = NewLRUCache(co); + + co.high_pri_pool_ratio = 0.5; + co.low_pri_pool_ratio = 0.5; + std::shared_ptr block_cache = NewLRUCache(co); + + options_.blob_cache = blob_cache; + options_.lowest_used_cache_tier = CacheTier::kVolatileTier; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = block_cache; + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kBlobCache, + {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}}); + options_.table_factory.reset( + NewBlockBasedTableFactory(block_based_options)); + + assert(db_->GetDbIdentity(db_id_).ok()); + assert(db_->GetDbSessionId(db_session_id_).ok()); + } + + void GenerateKeysAndBlobs() { + for (size_t i = 0; i < kNumBlobs; ++i) { + key_strs_.push_back("key" + std::to_string(i)); + blob_strs_.push_back("blob" + std::to_string(i)); + } + + blob_file_size_ = BlobLogHeader::kSize; + for (size_t i = 0; i < kNumBlobs; ++i) { + keys_.push_back({key_strs_[i]}); + blobs_.push_back({blob_strs_[i]}); + blob_file_size_ += + BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size(); + } + blob_file_size_ += BlobLogFooter::kSize; + } + + static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl< + CacheEntryRole::kBlobCache>::GetDummyEntrySize(); + static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry; + static constexpr int kNumShardBits = 0; // 2^0 shard + + static constexpr uint32_t kColumnFamilyId = 1; + static constexpr bool kHasTTL = false; + static constexpr uint64_t kBlobFileNumber = 1; + static constexpr size_t kNumBlobs = 16; + + std::vector keys_; + std::vector blobs_; + std::vector key_strs_; + std::vector blob_strs_; + uint64_t blob_file_size_; + + Options options_; + std::string db_id_; + std::string db_session_id_; +}; + +#ifndef ROCKSDB_LITE +TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"), + 0); + + GenerateKeysAndBlobs(); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr ExpirationRange expiration_range; + + std::vector blob_offsets(keys_.size()); + std::vector blob_sizes(keys_.size()); + + WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range, + expiration_range, kBlobFileNumber, keys_, blobs_, + kNoCompression, blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr 
backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ConcurrentCacheReservationManager* cache_res_mgr =
+      static_cast<ChargedCache*>(blob_source.GetBlobCache())
+          ->TEST_GetCacheReservationManager();
+  ASSERT_NE(cache_res_mgr, nullptr);
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  {
+    read_options.fill_cache = false;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+    }
+  }
+
+  {
+    read_options.fill_cache = true;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    // kNumBlobs is 16, so the total blob cache usage is less than a single
+    // dummy entry. Therefore, the cache reservation manager only reserves
+    // one dummy entry here.
+    uint64_t blob_bytes = 0;
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+      size_t charge = 0;
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+                                               blob_offsets[i], &charge));
+
+      blob_bytes += charge;
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+                options_.blob_cache->GetUsage());
+    }
+  }
+
+  {
+    OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
+    size_t blob_bytes = options_.blob_cache->GetUsage();
+
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      size_t charge = 0;
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+                                               blob_offsets[i], &charge));
+
+      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]);
+      // Note that we erase through the cache wrapper returned by
+      // GetBlobCache() rather than through options_.blob_cache directly: the
+      // wrapper's Erase() is what updates the cache usage bookkeeping when an
+      // entry is removed.
+      blob_source.GetBlobCache()->Erase(cache_key.AsSlice());
+      if (i == kNumBlobs - 1) {
+        // All the blobs have been removed from the cache, so cache_res_mgr
+        // should no longer reserve any space for them.
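+        // Erasing the final entry drops the cache usage to zero, at which
+        // point the reservation manager is expected to release its single
+        // dummy entry as well.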
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0); + } else { + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + } + blob_bytes -= charge; + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), + options_.blob_cache->GetUsage()); + } + } +} + +TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_, + "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"), + 0); + + GenerateKeysAndBlobs(); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2); + for (size_t i = 0; i < kNumBlobs; ++i) { + blob_file_size_ -= blobs_[i].size(); // old blob size + blob_strs_[i].resize(blob_size, '@'); + blobs_[i] = Slice(blob_strs_[i]); + blob_file_size_ += blobs_[i].size(); // new blob size + } + + std::vector blob_offsets(keys_.size()); + std::vector blob_sizes(keys_.size()); + + constexpr ExpirationRange expiration_range; + WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range, + expiration_range, kBlobFileNumber, keys_, blobs_, + kNoCompression, blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr blob_file_cache = + std::make_unique( + backing_cache.get(), &immutable_options, &file_options, + kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ConcurrentCacheReservationManager* cache_res_mgr = + static_cast(blob_source.GetBlobCache()) + ->TEST_GetCacheReservationManager(); + ASSERT_NE(cache_res_mgr, nullptr); + + ReadOptions read_options; + read_options.verify_checksums = true; + + { + read_options.fill_cache = false; + + std::vector values(keys_.size()); + + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0); + } + } + + { + read_options.fill_cache = true; + + std::vector values(keys_.size()); + + // Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we + // can't fit all the blobs in the cache at the same time, which means we + // should observe cache evictions once we reach the cache's capacity. + // Due to the overhead of the cache and the BlobContents objects, as well as + // jemalloc bin sizes, this happens after inserting seven blobs. 
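+    // Rough arithmetic behind that expectation (kSizeDummyEntry is the
+    // reservation manager's dummy entry size, currently 256 KiB):
+    //
+    //   blob_size      = kSizeDummyEntry / (kNumBlobs / 2);  // ~32 KiB each
+    //   kCacheCapacity = 1 * kSizeDummyEntry;                // ~256 KiB
+    //
+    // so only about kNumBlobs / 2 blobs fit in the cache at any one time.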
+ uint64_t blob_bytes = 0; + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + + // Release cache handle + values[i].Reset(); + + if (i < kNumBlobs / 2 - 1) { + size_t charge = 0; + ASSERT_TRUE(blob_source.TEST_BlobInCache( + kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge)); + + blob_bytes += charge; + } + + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), + options_.blob_cache->GetUsage()); + } + } +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc new file mode 100644 index 000000000..e6832a2ae --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_basic_test.cc @@ -0,0 +1,1789 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include + +#include "cache/compressed_secondary_cache.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. 
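+  // (With kBlockCacheTier, any read that would have to touch the file system
+  // fails fast with Status::Incomplete instead of performing blocking I/O.)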
+ ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, GetBlobFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + read_options.fill_cache = false; + + { + PinnableSlice result; + + read_options.read_tier = kReadAllTier; + ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + + result.Reset(); + read_options.read_tier = kBlockCacheTier; + + // Try again with no I/O allowed. Since we didn't re-fill the cache, the + // blob itself can only be read from the blob file, so the read should + // return Incomplete. + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); + ASSERT_TRUE(result.empty()); + } + + read_options.fill_cache = true; + + { + PinnableSlice result; + + read_options.read_tier = kReadAllTier; + ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + + result.Reset(); + read_options.read_tier = kBlockCacheTier; + + // Try again with no I/O allowed. The table and the necessary blocks/blobs + // should already be in their respective caches. 
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + } +} + +TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.statistics = CreateDBStatistics(); + + Reopen(options); + + int num_blobs = 5; + std::vector keys; + std::vector blobs; + + for (int i = 0; i < num_blobs; ++i) { + keys.push_back("key" + std::to_string(i)); + blobs.push_back("blob" + std::to_string(i)); + ASSERT_OK(Put(keys[i], blobs[i])); + } + ASSERT_OK(Flush()); + + ReadOptions read_options; + + { + read_options.fill_cache = false; + read_options.read_tier = kReadAllTier; + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } + + { + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Try again with no I/O allowed. Since we didn't re-fill the cache, + // the blob itself can only be read from the blob file, so iter->Valid() + // should be false. + iter->SeekToFirst(); + ASSERT_NOK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } + + { + read_options.fill_cache = true; + read_options.read_tier = kReadAllTier; + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Read blobs from the file and refill the cache. + int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), + num_blobs); + } + + { + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Try again with no I/O allowed. The table and the necessary blocks/blobs + // should already be in their respective caches. 
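+    // This cache-only scan should now see every key/value pair without
+    // inserting anything new, hence the zero BLOB_DB_CACHE_ADD count below.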
+ int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } +} + +TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + + LRUCacheOptions cache_options; + cache_options.capacity = 2048; + cache_options.num_shard_bits = 0; + cache_options.metadata_charge_policy = kDontChargeCacheMetadata; + + options.blob_cache = NewLRUCache(cache_options); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then iterate over three key-values. The second value is below the size + // limit and is thus stored inline; the other two are stored separately as + // blobs. We expect to have something pinned in the cache iff we are + // positioned on a blob. + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "long_value"; + static_assert(sizeof(first_value) - 1 >= min_blob_size, + "first_value too short to be stored as blob"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "short"; + static_assert(sizeof(second_value) - 1 < min_blob_size, + "second_value too long to be inlined"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + { + ReadOptions read_options; + read_options.fill_cache = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr 
iter(db_->NewIterator(read_options)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. + constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. 
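+  // (MultiGet reports a status per key, so the inlined value still succeeds
+  // while the two blob reads individually return Incomplete.)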
+ read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + constexpr size_t min_blob_size = 6; + options.min_blob_size = min_blob_size; + options.create_if_missing = true; + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + DestroyAndReopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. + constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + read_options.fill_cache = false; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } + + // Fill the cache when reading blobs from the blob file. 
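+  // (fill_cache was deliberately false in the earlier passes, so this pass is
+  // the first one to actually populate the blob cache.)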
+ read_options.read_tier = kReadAllTier; + read_options.fill_cache = true; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. All blobs should be successfully read from + // the cache. + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { + Options options = GetDefaultOptions(); + + // First, create an external SST file ["b"]. + const std::string file_path = dbname_ + "/test.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions()); + Status s = sst_file_writer.Open(file_path); + ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put("b", "b_value")); + ASSERT_OK(sst_file_writer.Finish()); + } + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.use_direct_reads = true; + options.allow_ingest_behind = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut + // new table file when encountering a new key whose 1-byte prefix changes. + constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + ROCKSDB_GTEST_SKIP("This test requires direct IO support"); + return; + } + ASSERT_OK(s); + + constexpr size_t num_keys = 3; + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + // first_blob, second_blob and third_blob in the same blob file. + // SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| + // | | | ^ ^ ^ + // | | | | | | + // | | +---------|-------|--------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + // fourth_blob in another blob file. + // SST Blob file SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'| + // | | | ^ ^ ^ | ^ + // | | | | | | | | + // | | +---------|-------|--------+ +-------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // Due to the above sst partitioner, we get 4 L1 files. The blob files are + // unchanged. 
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + // Ingest the external SST file into bottommost level. + std::vector ext_files{file_path}; + IngestExternalFileOptions opts; + opts.ingest_behind = true; + ASSERT_OK( + db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts)); + } + + // Now the database becomes as follows. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + // + // L6 ["b"] + + { + // Compact ["b"] to bottommost level. + Slice begin = Slice(second_key); + Slice end = Slice(second_key); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin, &end)); + } + + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["c"] | | ["d"] | + // | | | | | + // | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------|-----------------+ + // | + // L6 ["b"] + ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1)); + ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6)); + + bool called = false; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) { + auto* aligned_reqs = static_cast*>(arg); + assert(aligned_reqs); + ASSERT_EQ(1, aligned_reqs->size()); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array keys{{first_key, third_key, second_key}}; + + { + std::array values; + std::array statuses; + + // The MultiGet(), when constructing the KeyContexts, will process the keys + // in such order: a, d, b. The reason is that ["a"] and ["d"] are in L1, + // while ["b"] resides in L6. + // Consequently, the original FSReadRequest list prepared by + // Version::MultiGetblob() will be for "a", "d" and "b". It is unsorted as + // follows: + // + // ["a", offset=30, len=3033], + // ["d", offset=9096, len=3033], + // ["b", offset=3063, len=6033] + // + // If we do not sort them before calling MultiRead() in DirectIO, then the + // underlying IO merging logic will yield two requests. + // + // [offset=0, len=4096] (for "a") + // [offset=0, len=12288] (result of merging the request for "d" and "b") + // + // We need to sort them in Version::MultiGetBlob() so that the underlying + // IO merging logic in DirectIO mode works as expected. 
The correct + // behavior will be one aligned request: + // + // [offset=0, len=12288] + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(called); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_blob); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], third_blob); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], second_blob); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.min_blob_size = 0; + options.create_if_missing = true; + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + Reopen(options); + + constexpr size_t kNumBlobFiles = 3; + constexpr size_t kNumBlobsPerFile = 3; + constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles; + + std::vector key_strs; + std::vector value_strs; + for (size_t i = 0; i < kNumBlobFiles; ++i) { + for (size_t j = 0; j < kNumBlobsPerFile; ++j) { + std::string key = "key" + std::to_string(i) + "_" + std::to_string(j); + std::string value = + "value_as_blob" + std::to_string(i) + "_" + std::to_string(j); + ASSERT_OK(Put(key, value)); + key_strs.push_back(key); + value_strs.push_back(value); + } + ASSERT_OK(Flush()); + } + assert(key_strs.size() == kNumKeys); + std::array keys; + for (size_t i = 0; i < keys.size(); ++i) { + keys[i] = key_strs[i]; + } + + ReadOptions read_options; + read_options.read_tier = kReadAllTier; + read_options.fill_cache = false; + + { + std::array values; + std::array statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } + + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_TRUE(statuses[i].IsIncomplete()); + ASSERT_TRUE(values[i].empty()); + } + } + + read_options.read_tier = kReadAllTier; + read_options.fill_cache = true; + + { + std::array values; + std::array statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } + + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr 
char key[] = "key"; + constexpr char blob[] = "blob"; + + ASSERT_OK(Put(key, blob)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "Version::Get::TamperWithBlobIndex", [](void* arg) { + Slice* const blob_index = static_cast(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + + DestroyAndReopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_strs; + std::array value_strs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_strs[i] = "foo" + std::to_string(i); + value_strs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_strs[i], value_strs[i])); + keys[i] = key_strs[i]; + } + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + ASSERT_OK(Put(key, blob)); + keys[kNumOfKeys] = key; + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) { + KeyContext* const key_context = static_cast(arg); + assert(key_context); + assert(key_context->key); + + if (*(key_context->key) == key) { + Slice* const blob_index = key_context->value; + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array values; + std::array statuses; + db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/false); + for (size_t i = 0; i < kNumOfKeys + 1; ++i) { + if (i != kNumOfKeys) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("blob_value" + std::to_string(i), values[i]); + } else { + ASSERT_TRUE(statuses[i].IsCorruption()); + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_bufs; + std::array value_bufs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_bufs[i] = "foo" + std::to_string(i); + value_bufs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_bufs[i], value_bufs[i])); + keys[i] = key_bufs[i]; + } + ASSERT_OK(Flush()); + + std::array values; + std::array statuses; + ReadOptions read_opts; + read_opts.value_size_soft_limit = 1; + db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/true); + for (const auto& s : statuses) { + ASSERT_TRUE(s.IsAborted()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob 
index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. + std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr<TraceWriter> trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + { + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr<TraceReader> trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version)); + + // Read records. + int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations.
+ ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector<std::string> files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, Properties) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr size_t key1_size = sizeof(key1) - 1; + + constexpr char key2[] = "key2"; + constexpr size_t key2_size = sizeof(key2) - 1; + + constexpr char key3[] = "key3"; + constexpr size_t key3_size = sizeof(key3) - 1; + + constexpr char blob[] = "00000000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + constexpr char longer_blob[] = "00000000000000000000"; + constexpr size_t longer_blob_size = sizeof(longer_blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Put(key2, longer_blob)); +
ASSERT_OK(Flush()); + + constexpr size_t first_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + + longer_blob_size + BlobLogFooter::kSize; + + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr size_t second_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size + + BlobLogFooter::kSize; + + constexpr size_t total_expected_size = + first_blob_file_expected_size + second_blob_file_expected_size; + + // Number of blob files + uint64_t num_blob_files = 0; + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files)); + ASSERT_EQ(num_blob_files, 2); + + // Total size of live blob files + uint64_t live_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize, + &live_blob_file_size)); + ASSERT_EQ(live_blob_file_size, total_expected_size); + + // Total amount of garbage in live blob files + { + uint64_t live_blob_file_garbage_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize, + &live_blob_file_garbage_size)); + ASSERT_EQ(live_blob_file_garbage_size, 0); + } + + // Total size of all blob files across all versions + // Note: this should be the same as above since we only have one + // version at this point. + uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, total_expected_size); + + // Delete key2 to create some garbage + ASSERT_OK(Delete(key2)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + constexpr size_t expected_garbage_size = + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + + longer_blob_size; + + constexpr double expected_space_amp = + static_cast<double>(total_expected_size) / + (total_expected_size - expected_garbage_size); + + // Blob file stats + std::string blob_stats; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats)); + + std::ostringstream oss; + oss << "Number of blob files: 2\nTotal size of blob files: " + << total_expected_size + << "\nTotal size of garbage in blob files: " << expected_garbage_size + << "\nBlob file space amplification: " << expected_space_amp << '\n'; + + ASSERT_EQ(blob_stats, oss.str()); + + // Total amount of garbage in live blob files + { + uint64_t live_blob_file_garbage_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize, + &live_blob_file_garbage_size)); + ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size); + } +} + +TEST_F(DBBlobBasicTest, PropertiesMultiVersion) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr char key2[] = "key2"; + constexpr char key3[] = "key3"; + + constexpr size_t key_size = sizeof(key1) - 1; + static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2"); + static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3"); + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + // Create an iterator to keep the current version alive +
std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); + + // Note: the Delete and subsequent compaction results in the first blob file + // not making it to the final version. (It is still part of the previous + // version kept alive by the iterator though.) On the other hand, the Put + // results in a third blob file. + ASSERT_OK(Delete(key1)); + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Total size of all blob files across all versions: between the two versions, + // we should have three blob files of the same size with one blob each. + // The version kept alive by the iterator contains the first and the second + // blob file, while the final version contains the second and the third blob + // file. (The second blob file is thus shared by the two versions but should + // be counted only once.) + uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, + 3 * (BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size + BlobLogFooter::kSize)); +} +#endif // !ROCKSDB_LITE + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface<std::string> { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_; + std::string sync_point_; +}; + +class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest { + public: + DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::MultiGetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key,
second_value)); + + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{first_key, second_key}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char key1[] = "key1"; + constexpr char value1[] = "blob1"; + + ASSERT_OK(Put(key1, value1)); + ASSERT_OK(Flush()); + + constexpr char key2[] = "key2"; + constexpr char value2[] = "blob2"; + + ASSERT_OK(Put(key2, value2)); + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{key1, key2}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + bool first_blob_file = true; + SyncPoint::GetInstance()->SetCallBack( + sync_point_, [&first_blob_file, this](void* /* arg */) { + if (first_blob_file) { + first_blob_file = false; + return; + } + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(statuses[0]); + ASSERT_EQ(value1, values[0]); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); +
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 1 << 25; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + DestroyAndReopen(options); + + constexpr size_t kNumBlobs = 10; + constexpr size_t kValueSize = 100; + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= kNumBlobs; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlobs * 2, + options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 1 << 25; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + DestroyAndReopen(options); + + constexpr size_t kNumBlobs = 10; + constexpr size_t kValueSize = 100; + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= 5; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(2, 
options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT)); + } + + ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}})); + + for (size_t i = 6; i <= kNumBlobs; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(2, + options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) { + CompressedSecondaryCacheOptions secondary_cache_opts; + secondary_cache_opts.capacity = 1 << 20; + secondary_cache_opts.num_shard_bits = 0; + secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata; + secondary_cache_opts.compression_type = kNoCompression; + + LRUCacheOptions primary_cache_opts; + primary_cache_opts.capacity = 1024; + primary_cache_opts.num_shard_bits = 0; + primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata; + primary_cache_opts.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.enable_blob_files = true; + options.blob_cache = NewLRUCache(primary_cache_opts); + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + + DestroyAndReopen(options); + + // Note: only one of the two blobs fit in the primary cache at any given time. + constexpr char first_key[] = "foo"; + constexpr size_t first_blob_size = 512; + const std::string first_blob(first_blob_size, 'a'); + + constexpr char second_key[] = "bar"; + constexpr size_t second_blob_size = 768; + const std::string second_blob(second_blob_size, 'b'); + + // First blob is inserted into primary cache during flush. + ASSERT_OK(Put(first_key, first_blob)); + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1); + + // Second blob is inserted into primary cache during flush, + // First blob is evicted but only a dummy handle is inserted into secondary + // cache. + ASSERT_OK(Put(second_key, second_blob)); + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1); + + // First blob is inserted into primary cache. + // Second blob is evicted but only a dummy handle is inserted into secondary + // cache. + ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 0); + // Second blob is inserted into primary cache, + // First blob is evicted and is inserted into secondary cache. 
+ ASSERT_EQ(Get(second_key), second_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 0); + + // First blob's dummy item is inserted into primary cache b/c of lookup. + // Second blob is still in primary cache. + ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 1); + + // First blob's item is inserted into primary cache b/c of lookup. + // Second blob is evicted and inserted into secondary cache. + ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc new file mode 100644 index 000000000..f3fe3c03b --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc @@ -0,0 +1,913 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
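Editor's note: the IO-error tests above all rely on one mechanism, so it is worth spelling out once. Below is a minimal sketch of that SyncPoint-plus-fault-injection pattern; ArmIOErrorAt is a hypothetical helper name, while SyncPoint, FaultInjectionTestEnv, SetFilesystemActive, and the marker strings are taken directly from the test code above.

// Sketch only: arm an IO failure at a named execution point, mirroring what
// DBBlobBasicIOErrorTest does inline in each test body.
void ArmIOErrorAt(const std::string& sync_point,
                  FaultInjectionTestEnv* fault_injection_env) {
  SyncPoint::GetInstance()->SetCallBack(
      sync_point, [fault_injection_env, sync_point](void* /* arg */) {
        // Once the blob read path reaches this marker, every subsequent
        // filesystem operation fails with the injected status.
        fault_injection_env->SetFilesystemActive(false,
                                                 Status::IOError(sync_point));
      });
  SyncPoint::GetInstance()->EnableProcessing();
}

Tests built on this pattern must call DisableProcessing() and ClearAllCallBacks() before teardown, as each test above does, so the callback cannot outlive the environment it captures.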
+ +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector<InternalStats::CompactionStats>& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class FilterByValueLength : public CompactionFilter { + public: + explicit FilterByValueLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.value.length"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (existing_value.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string*
skip_until) const override; + + private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; + +class SkipUntilFilter : public CompactionFilter { + public: + explicit SkipUntilFilter(std::string skip_until) + : skip_until_(std::move(skip_until)) {} + + const char* Name() const override { + return "rocksdb.compaction.filter.skip.until"; + } + + CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */, + ValueType /* value_type */, + const Slice& /* existing_value */, + std::string* /* new_value */, + std::string* skip_until) const override { + assert(skip_until); + *skip_until = skip_until_; + + return CompactionFilter::Decision::kRemoveAndSkipUntil; + } + + private: + std::string skip_until_; +}; + +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple<std::string, CompactionFilter::Decision, + CompactionFilter::Decision>> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr<CompactionFilter> compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr<CompactionFilter>
compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, FilterByValueLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 5; + options.create_if_missing = true; + constexpr size_t kValueLength = 5; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new FilterByValueLength(kValueLength)); + options.compaction_filter = compaction_filter_guard.get(); + + const std::vector<std::string> short_value_keys = {"a", "e", "j"}; + constexpr char short_value[] = "val"; + const std::vector<std::string> long_value_keys = {"b", "f", "k"}; + constexpr char long_value[] = "valuevalue"; + + DestroyAndReopen(options); + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_OK(Put(short_value_keys[i], short_value)); + } + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_OK(Put(long_value_keys[i], long_value)); + } + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound()); + value.clear(); + } + for (size_t i = 0; i < long_value_keys.size(); ++i) { + ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value)); + ASSERT_EQ(long_value, value); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove based on value; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) { + Options options = GetDefaultOptions(); + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.blob_file_starting_level = 5; + options.create_if_missing = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut a + // new table file when encountering a new key whose 1-byte prefix changes.
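// (Editor's sketch, kept as C++ comments since it sits inside a hunk: with a
// 1-byte fixed-prefix partitioner, the four keys used below -- "a", "b", "c"
// and "d" -- all carry distinct prefixes, so the compaction cuts four output
// files, which is exactly what the NumTableFilesAtLevel(/*level=*/1) == 4
// assertions in this test count.)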
+ constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + ASSERT_OK(TryReopen(options)); + + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1)); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // No blob file should be created since blob_file_starting_level is 5. + ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + options.blob_file_starting_level = 1; + DestroyAndReopen(options); + + ASSERT_OK(Put(first_key, first_blob)); + ASSERT_OK(Put(second_key, second_blob)); + ASSERT_OK(Put(third_key, third_blob)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(fourth_key, fourth_blob)); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1)); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + // The compaction's output level equals to blob_file_starting_level. 
+ ASSERT_EQ(1, GetBlobFileNumbers().size()); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + } + + Close(); +} +#endif + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::string> keys = {"a", "b", "c"}; + const std::vector<std::string> values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, SkipUntilFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new SkipUntilFilter("z")); + options.compaction_filter = compaction_filter_guard.get(); + + Reopen(options); + + const std::vector<std::string> keys{"a", "b", "c"}; + const std::vector<std::string> values{"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + + ASSERT_OK(Flush()); + + int process_in_flow_called = 0; + + SyncPoint::GetInstance()->SetCallBack( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow", + [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (const auto& key : keys) { + ASSERT_EQ(Get(key), "NOT_FOUND"); + } + + // Make sure SkipUntil was performed using iteration rather than Seek + ASSERT_EQ(process_in_flow_called, keys.size()); + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; +
options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. + std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::pair<std::string, std::string>> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + ASSERT_OK(Put(key, blob)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex", + [](void* arg) { + Slice* const blob_index = static_cast<Slice*>(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector<uint64_t> blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); +
ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, TrackGarbage) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + Reopen(options); + + // First table+blob file pair: 4 blobs with different keys + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + // Second table+blob file pair: overwrite 2 existing keys + constexpr char new_first_value[] = "new_first_value"; + constexpr char new_second_value[] = "new_second_value"; + + ASSERT_OK(Put(first_key, new_first_value)); + ASSERT_OK(Put(second_key, new_second_value)); + ASSERT_OK(Flush()); + + // Compact them together. The first blob file should have 2 garbage blobs + // corresponding to the 2 overwritten keys. + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 2); + + { + const auto& meta = blob_files.front(); + assert(meta); + + constexpr uint64_t first_expected_bytes = + sizeof(first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t second_expected_bytes = + sizeof(second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + constexpr uint64_t third_expected_bytes = + sizeof(third_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) - + 1); + constexpr uint64_t fourth_expected_bytes = + sizeof(fourth_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 4); + ASSERT_EQ(meta->GetTotalBlobBytes(), + first_expected_bytes + second_expected_bytes + + third_expected_bytes + fourth_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 2); + ASSERT_EQ(meta->GetGarbageBlobBytes(), + first_expected_bytes + second_expected_bytes); + } + + { + const auto& meta = blob_files.back(); + assert(meta); + + constexpr uint64_t new_first_expected_bytes = + sizeof(new_first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t new_second_expected_bytes = + sizeof(new_second_value) - 1 + + 
BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 2); + ASSERT_EQ(meta->GetTotalBlobBytes(), + new_first_expected_bytes + new_second_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta->GetGarbageBlobBytes(), 0); + } +} + +TEST_F(DBBlobCompactionTest, MergeBlobWithBase) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + ASSERT_OK(Put("Key1", "v1_1")); + ASSERT_OK(Put("Key2", "v2_1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_2")); + ASSERT_OK(Merge("Key2", "v2_2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_3")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3"); + ASSERT_EQ(Get("Key2"), "v2_1,v2_2"); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "pie"); + ASSERT_EQ(Get("foo"), "baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) { + Options options = GetDefaultOptions(); + + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("pie")); + + options.compaction_filter = compaction_filter_guard.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "limepie"); + ASSERT_EQ(Get("foo"), "barpie"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; +
options.blob_compaction_readahead_size = 1 << 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("key", "pie")); + ASSERT_OK(Merge("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "lime,pie"); + ASSERT_EQ(Get("foo"), "bar,baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) { + Options options = GetDefaultOptions(); + + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + LRUCacheOptions cache_options; + cache_options.capacity = 1 << 20; + cache_options.metadata_charge_policy = kDontChargeCacheMetadata; + + options.blob_cache = NewLRUCache(cache_options); + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc new file mode 100644 index 000000000..7ac7ce3fc --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
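Editor's note: several tests in the file above (Properties, PropertiesMultiVersion, TrackGarbage) hand-compute expected blob file sizes. The arithmetic, restated once as a sketch: ExpectedBlobFileSize is a hypothetical helper, while BlobLogHeader::kSize, BlobLogFooter::kSize, and BlobLogRecord::CalculateAdjustmentForRecordHeader come from blob_log_format.h, exactly as the tests use them.

#include <cstdint>
#include <utility>
#include <vector>

// A blob file is laid out as header + one record per blob + footer. Each
// record contributes its value bytes plus an adjustment that covers the
// record header and the key bytes.
uint64_t ExpectedBlobFileSize(
    const std::vector<std::pair<size_t, size_t>>& key_value_sizes) {
  uint64_t total = BlobLogHeader::kSize + BlobLogFooter::kSize;
  for (const auto& kv : key_value_sizes) {
    total += BlobLogRecord::CalculateAdjustmentForRecordHeader(kv.first) +
             kv.second;  // kv = {key size, value size}
  }
  return total;
}

For example, the first blob file in the Properties test holds {key1, blob} and {key2, longer_blob}, so ExpectedBlobFileSize({{key1_size, blob_size}, {key2_size, longer_blob_size}}) reproduces first_blob_file_expected_size above.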
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector<std::string> filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast<const Status*>(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc new file mode 100644 index 000000000..64c550894 --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_index_test.cc @@ -0,0 +1,602 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <functional> +#include <string> +#include <utility> +#include <vector> + +#include "db/arena_wrapped_db_iter.h" +#include "db/blob/blob_index.h" +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// kTypeBlobIndex is a value type used by BlobDB only.
+// should accept the value type on write, and report not supported value
+// for reads, unless caller request for it explicitly. The base rocksdb
+// doesn't understand format of actual blob index (the value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+  enum Tier {
+    kMemtable = 0,
+    kImmutableMemtables = 1,
+    kL0SstFile = 2,
+    kLnSstFile = 3,
+  };
+  const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+                                       Tier::kImmutableMemtables,
+                                       Tier::kL0SstFile, Tier::kLnSstFile};
+
+  DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
+
+  ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+  ColumnFamilyData* cfd() {
+    return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd();
+  }
+
+  Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+                      const Slice& blob_index) {
+    return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+                                            blob_index);
+  }
+
+  Status Write(WriteBatch* batch) {
+    return dbfull()->Write(WriteOptions(), batch);
+  }
+
+  std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+                      const Snapshot* snapshot = nullptr) {
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    PinnableSlice value;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cfh();
+    get_impl_options.value = &value;
+    get_impl_options.is_blob_index = is_blob_index;
+    auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+    if (s.IsNotFound()) {
+      return "NOT_FOUND";
+    }
+    if (s.IsCorruption()) {
+      return "CORRUPTION";
+    }
+    if (s.IsNotSupported()) {
+      return "NOT_SUPPORTED";
+    }
+    if (!s.ok()) {
+      return s.ToString();
+    }
+    return value.ToString();
+  }
+
+  std::string GetBlobIndex(const Slice& key,
+                           const Snapshot* snapshot = nullptr) {
+    bool is_blob_index = false;
+    std::string value = GetImpl(key, &is_blob_index, snapshot);
+    if (!is_blob_index) {
+      return "NOT_BLOB";
+    }
+    return value;
+  }
+
+  ArenaWrappedDBIter* GetBlobIterator() {
+    return dbfull()->NewIteratorImpl(
+        ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+        nullptr /*read_callback*/, true /*expose_blob_index*/);
+  }
+
+  Options GetTestOptions() {
+    Options options;
+    options.env = CurrentOptions().env;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.disable_auto_compactions = true;
+    // Disable auto flushes.
+    options.max_write_buffer_number = 10;
+    options.min_write_buffer_number_to_merge = 10;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    return options;
+  }
+
+  void MoveDataTo(Tier tier) {
+    switch (tier) {
+      case Tier::kMemtable:
+        break;
+      case Tier::kImmutableMemtables:
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+        break;
+      case Tier::kL0SstFile:
+        ASSERT_OK(Flush());
+        break;
+      case Tier::kLnSstFile:
+        ASSERT_OK(Flush());
+        ASSERT_OK(Put("a", "dummy"));
+        ASSERT_OK(Put("z", "dummy"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(
+            dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+        ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+        break;
+    }
+  }
+};
+
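// Editor's note: GetBlobIterator() above returns an iterator created with
// expose_blob_index set, so the raw blob index is surfaced instead of being
// rejected. A hypothetical test-side usage of that internal helper (the key
// name is illustrative; this mirrors check_is_blob() in the Iterate test
// further below):
//
//   ArenaWrappedDBIter* it = GetBlobIterator();
//   it->Seek("some_key");
//   if (it->Valid() && it->IsBlob()) {
//     // it->value() holds the encoded blob index, not a resolved blob value.
//   }
//   delete it;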
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+TEST_F(DBBlobIndexTest, Write) {
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    std::vector<std::pair<std::string, std::string>> key_values;
+
+    constexpr size_t num_key_values = 5;
+
+    key_values.reserve(num_key_values);
+
+    for (size_t i = 1; i <= num_key_values; ++i) {
+      std::string key = "key" + std::to_string(i);
+
+      std::string blob_index;
+      BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
+                                  "blob" + std::to_string(i));
+
+      key_values.emplace_back(std::move(key), std::move(blob_index));
+    }
+
+    for (const auto& key_value : key_values) {
+      WriteBatch batch;
+      ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second));
+      ASSERT_OK(Write(&batch));
+    }
+
+    MoveDataTo(tier);
+
+    for (const auto& key_value : key_values) {
+      ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second);
+    }
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should be able to return blob index if is_blob_index is
+// provided, otherwise it should return Status::NotSupported (when reading from
+// memtable) or Status::Corruption (when reading from SST). Reading from SST
+// returns Corruption because we can't differentiate between the application
+// accidentally opening the base DB of a stacked BlobDB and actual corruption
+// when using the integrated BlobDB.
+TEST_F(DBBlobIndexTest, Get) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("key", "value"));
+    ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
+    ASSERT_OK(Write(&batch));
+
+    MoveDataTo(tier);
+
+    // Verify normal value
+    bool is_blob_index = false;
+    PinnableSlice value;
+    ASSERT_EQ("value", Get("key"));
+    ASSERT_EQ("value", GetImpl("key"));
+    ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+    ASSERT_FALSE(is_blob_index);
+
+    // Verify blob index
+    if (tier <= kImmutableMemtables) {
+      ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+    } else {
+      ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
+      ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
+    }
+    ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
+    ASSERT_TRUE(is_blob_index);
+  }
+}
+
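// Editor's sketch of the behavior asserted above, seen from the public API
// (the key name is illustrative): a kTypeBlobIndex entry read through a plain
// DB::Get() fails differently depending on where the entry currently lives.
//
//   PinnableSlice result;
//   Status s = db_->Get(ReadOptions(), db_->DefaultColumnFamily(),
//                       "blob_key", &result);
//   // While the entry is in a (mutable or immutable) memtable:
//   //   s.IsNotSupported() == true
//   // Once it has been flushed to an SST file:
//   //   s.IsCorruption() == true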
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should NOT return Status::NotSupported/Status::Corruption
+// if blob index is updated with a normal value. See the test case above for
+// more details.
+TEST_F(DBBlobIndexTest, Updated) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+    WriteBatch batch;
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
+    }
+    ASSERT_OK(Write(&batch));
+    // Avoid blob values from being purged.
+    const Snapshot* snapshot = dbfull()->GetSnapshot();
+    ASSERT_OK(Put("key1", "new_value"));
+    ASSERT_OK(Merge("key2", "a"));
+    ASSERT_OK(Merge("key2", "b"));
+    ASSERT_OK(Merge("key2", "c"));
+    ASSERT_OK(Delete("key3"));
+    ASSERT_OK(SingleDelete("key4"));
+    ASSERT_OK(Delete("key5"));
+    ASSERT_OK(Merge("key5", "a"));
+    ASSERT_OK(Merge("key5", "b"));
+    ASSERT_OK(Merge("key5", "c"));
+    ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+    MoveDataTo(tier);
+    for (int i = 0; i < 10; i++) {
+      ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
+    }
+    ASSERT_EQ("new_value", Get("key1"));
+    if (tier <= kImmutableMemtables) {
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+    } else {
+      ASSERT_EQ("CORRUPTION", GetImpl("key2"));
+    }
+    ASSERT_EQ("NOT_FOUND", Get("key3"));
+    ASSERT_EQ("NOT_FOUND", Get("key4"));
+    ASSERT_EQ("a,b,c", GetImpl("key5"));
+    for (int i = 6; i < 9; i++) {
+      ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+    }
+    ASSERT_EQ(blob_index, GetBlobIndex("key9"));
+    dbfull()->ReleaseSnapshot(snapshot);
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. When a blob iterator is used, it should set the
+// expose_blob_index flag for the underlying DBIter, and retrieve/return the
+// corresponding blob value. If a regular DBIter is created (i.e.
+// expose_blob_index is not set), it should return Status::Corruption.
+TEST_F(DBBlobIndexTest, Iterate) {
+  const std::vector<std::vector<ValueType>> data = {
+      /*00*/ {kTypeValue},
+      /*01*/ {kTypeBlobIndex},
+      /*02*/ {kTypeValue},
+      /*03*/ {kTypeBlobIndex, kTypeValue},
+      /*04*/ {kTypeValue},
+      /*05*/ {kTypeValue, kTypeBlobIndex},
+      /*06*/ {kTypeValue},
+      /*07*/ {kTypeDeletion, kTypeBlobIndex},
+      /*08*/ {kTypeValue},
+      /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+      /*10*/ {kTypeValue},
+      /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+      /*12*/ {kTypeValue},
+      /*13*/
+      {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+      /*14*/ {kTypeValue},
+      /*15*/ {kTypeBlobIndex},
+      /*16*/ {kTypeValue},
+  };
+
+  auto get_key = [](int index) {
+    char buf[20];
+    snprintf(buf, sizeof(buf), "%02d", index);
+    return "key" + std::string(buf);
+  };
+
+  auto get_value = [&](int index, int version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status().code());
+    if (expected_status == Status::kOk) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  auto create_normal_iterator = [&]() -> Iterator* {
+    return dbfull()->NewIterator(ReadOptions());
+  };
+
+  auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+  auto check_is_blob = [&](bool is_blob) {
+    return [is_blob](Iterator* iterator) {
+      ASSERT_EQ(is_blob,
+                reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+    };
+  };
+
+  auto verify = [&](int index, Status::Code expected_status,
+                    const Slice& forward_value, const Slice& backward_value,
+                    std::function<Iterator*()> create_iterator,
+                    std::function<void(Iterator*)> extra_check = nullptr) {
+    // Seek
+    auto* iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index));
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Next
+    iterator = create_iterator();
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index - 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Next();
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // SeekForPrev
+    iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->SeekForPrev(get_key(index));
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Prev
+    iterator = create_iterator();
+    iterator->Seek(get_key(index + 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Prev();
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+  };
+
+  for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Avoid values from being purged.
+    std::vector<const Snapshot*> snapshots;
+    DestroyAndReopen(GetTestOptions());
+
+    // fill data
+    for (int i = 0; i < static_cast<int>(data.size()); i++) {
+      for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+        std::string key = get_key(i);
+        std::string value = get_value(i, j);
+        WriteBatch batch;
+        switch (data[i][j]) {
+          case kTypeValue:
+            ASSERT_OK(Put(key, value));
+            break;
+          case kTypeDeletion:
+            ASSERT_OK(Delete(key));
+            break;
+          case kTypeSingleDeletion:
+            ASSERT_OK(SingleDelete(key));
+            break;
+          case kTypeMerge:
+            ASSERT_OK(Merge(key, value));
+            break;
+          case kTypeBlobIndex:
+            ASSERT_OK(PutBlobIndex(&batch, key, value));
+            ASSERT_OK(Write(&batch));
+            break;
+          default:
+            FAIL();
+        };
+      }
+      snapshots.push_back(dbfull()->GetSnapshot());
+    }
+    ASSERT_OK(
+        dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+    snapshots.push_back(dbfull()->GetSnapshot());
+    MoveDataTo(tier);
+
+    // Normal iterator
+    verify(1, Status::kCorruption, "", "", create_normal_iterator);
+    verify(3, Status::kCorruption, "", "", create_normal_iterator);
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_normal_iterator);
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_normal_iterator);
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_normal_iterator);
+    verify(11, Status::kCorruption, "", "", create_normal_iterator);
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_normal_iterator);
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_normal_iterator);
+
+    // Iterator with blob support
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+    // Iterator with blob support and using seek.
+    ASSERT_OK(dbfull()->SetOptions(
+        cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+#endif  // !ROCKSDB_LITE
+
+    for (auto* snapshot : snapshots) {
+      dbfull()->ReleaseSnapshot(snapshot);
+    }
+  }
+}
+
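// Editor's note: both iterator tests in this file re-run their checks after
// calling SetOptions() with {"max_sequential_skip_in_iterations", "0"}.
// Setting that option to zero makes DBIter give up on sequential skipping
// immediately and take the seek-based fallback path
// (DBIter::FindValueForCurrentKeyUsingSeek), so both code paths get covered
// by the same assertions.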
+TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
+  const std::vector<std::vector<std::string>> data = {
+      /*00*/ {"Put"},
+      /*01*/ {"Put", "Merge", "Merge", "Merge"},
+      /*02*/ {"Put"}};
+
+  auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
+
+  auto get_value = [&](size_t index, size_t version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  auto check_iterator = [&](Iterator* iterator, Status expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status());
+    if (expected_status.ok()) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  auto verify = [&](size_t index, Status expected_status,
+                    const Slice& expected_value) {
+    // Seek
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Next
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index - 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Next();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // SeekForPrev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->SeekForPrev(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Prev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      iterator->Seek(get_key(index + 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Prev();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+  };
+
+  Options options = GetTestOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  DestroyAndReopen(options);
+
+  // fill data
+  for (size_t i = 0; i < data.size(); i++) {
+    for (size_t j = 0; j < data[i].size(); j++) {
+      std::string key = get_key(i);
+      std::string value = get_value(i, j);
+      if (data[i][j] == "Put") {
+        ASSERT_OK(Put(key, value));
+        ASSERT_OK(Flush());
+      } else if (data[i][j] == "Merge") {
+        ASSERT_OK(Merge(key, value));
+        ASSERT_OK(Flush());
+      }
+    }
+  }
+
+  std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
+                               get_value(1, 2) + "," + get_value(1, 3);
+  Status expected_status;
+  verify(1, expected_status, expected_value);
+
+#ifndef ROCKSDB_LITE
+  // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
+  ASSERT_OK(dbfull()->SetOptions(cfh(),
+                                 {{"max_sequential_skip_in_iterations", "0"}}));
+  verify(1, expected_status, expected_value);
+#endif  // !ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 000000000..079576f51
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/prefetch_buffer_collection.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer(
+    uint64_t file_number) {
+  auto& prefetch_buffer = prefetch_buffers_[file_number];
+  if (!prefetch_buffer) {
+    prefetch_buffer.reset(
+        new FilePrefetchBuffer(readahead_size_, readahead_size_));
+  }
+
+  return prefetch_buffer.get();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h
new file mode 100644
index 000000000..b973eddc0
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+
+#include "file/file_prefetch_buffer.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
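// Editor's sketch of how the class declared below is meant to be used by a
// (sub)compaction; the file number and readahead size are illustrative:
//
//   PrefetchBufferCollection buffers(/* readahead_size */ 1 << 20);
//   FilePrefetchBuffer* buffer =
//       buffers.GetOrCreatePrefetchBuffer(/* file_number */ 42);
//   // Repeated lookups with the same file number return the same buffer, so
//   // sequential blob reads from file 42 share one readahead window:
//   assert(buffer == buffers.GetOrCreatePrefetchBuffer(42));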
+// A class that owns a collection of FilePrefetchBuffers using the file number
+// as key. Used for implementing compaction readahead for blob files. Designed
+// to be accessed by a single thread only: every (sub)compaction needs its own
+// buffers since they are guaranteed to read different blobs from different
+// positions even when reading the same file.
+class PrefetchBufferCollection {
+ public:
+  explicit PrefetchBufferCollection(uint64_t readahead_size)
+      : readahead_size_(readahead_size) {
+    assert(readahead_size_ > 0);
+  }
+
+  FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number);
+
+ private:
+  uint64_t readahead_size_;
+  std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>>
+      prefetch_buffers_;  // maps file number to prefetch buffer
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..9283ffd64
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,434 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/blob/blob_file_builder.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+                              WritableFileWriter* file) {
+  assert((tboptions.column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         tboptions.column_family_name.empty());
+  return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
+}
+
+Status BuildTable(
+    const std::string& dbname, VersionSet* versions,
+    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+    const FileOptions& file_options, TableCache* table_cache,
+    InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+    bool paranoid_file_checks, InternalStats* internal_stats,
+    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCreationReason blob_creation_reason,
+    const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
+    int job_id, const Env::IOPriority io_priority,
+    TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
+    const std::string* full_history_ts_low,
+    BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
+    uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
+  assert((tboptions.column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; + // Reports the IOStats for flush for every following bytes. + const size_t kReportFlushIOStatsEvery = 1048576; + OutputValidator output_validator( + tboptions.internal_comparator, + /*enable_order_check=*/ + mutable_cf_options.check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks); + Status s; + meta->fd.file_size = 0; + iter->SeekToFirst(); + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots, full_history_ts_low)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; + for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + + std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), + meta->fd.GetPathId()); + std::vector blob_file_paths; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); +#endif // !ROCKSDB_LITE + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + + TableProperties tp; + bool table_file_created = false; + if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + + TableBuilder* builder; + std::unique_ptr file_writer; + { + std::unique_ptr file; +#ifndef NDEBUG + bool use_direct_writes = file_options.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); +#endif // !NDEBUG + IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + assert(s.ok()); + s = io_s; + if (io_status->ok()) { + *io_status = io_s; + } + if (!s.ok()) { + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); + return s; + } + + table_file_created = true; + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; + file->SetIOPriority(io_priority); + file->SetWriteLifeTimeHint(write_hint); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + builder = 
NewTableBuilder(tboptions, file_writer.get()); + } + + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); + + std::unique_ptr blob_file_builder( + (mutable_cf_options.enable_blob_files && + tboptions.level_at_creation >= + mutable_cf_options.blob_file_starting_level && + blob_file_additions) + ? new BlobFileBuilder( + versions, fs, &ioptions, &mutable_cf_options, &file_options, + tboptions.db_id, tboptions.db_session_id, job_id, + tboptions.column_family_id, tboptions.column_family_name, + io_priority, write_hint, io_tracer, blob_callback, + blob_creation_reason, &blob_file_paths, blob_file_additions) + : nullptr); + + const std::atomic kManualCompactionCanceledFalse{false}; + CompactionIterator c_iter( + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + job_snapshot, snapshot_checker, env, + ShouldReportDetailedTime(env, ioptions.stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + blob_file_builder.get(), ioptions.allow_data_in_errors, + ioptions.enforce_single_del_contracts, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); + + c_iter.SeekToFirst(); + for (; c_iter.Valid(); c_iter.Next()) { + const Slice& key = c_iter.key(); + const Slice& value = c_iter.value(); + const ParsedInternalKey& ikey = c_iter.ikey(); + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. + s = output_validator.Add(key, value); + if (!s.ok()) { + break; + } + builder->Add(key, value); + + s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); + if (!s.ok()) { + break; + } + + // TODO(noetzli): Update stats after flush, too. + if (io_priority == Env::IO_HIGH && + IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) { + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + } + } + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), + tombstone.seq_, + tboptions.internal_comparator); + } + } + + TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } + if (!s.ok() || empty) { + builder->Abandon(); + } else { + std::string seqno_time_mapping_str; + seqno_to_time_mapping.Encode( + seqno_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder->SetSeqnoTimeTableProperties( + seqno_time_mapping_str, + ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO + ? 
meta->file_creation_time + : meta->oldest_ancester_time); + s = builder->Finish(); + } + if (io_status->ok()) { + *io_status = builder->io_status(); + } + + if (s.ok() && !empty) { + uint64_t file_size = builder->FileSize(); + meta->fd.file_size = file_size; + meta->marked_for_compaction = builder->NeedCompact(); + assert(meta->fd.GetFileSize() > 0); + tp = builder + ->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && + memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. + if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } + if (table_properties) { + *table_properties = tp; + } + } + delete builder; + + // Finish and check for file errors + TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); + if (s.ok() && !empty) { + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); + *io_status = file_writer->Sync(ioptions.use_fsync); + } + TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); + if (s.ok() && io_status->ok() && !empty) { + *io_status = file_writer->Close(); + } + if (s.ok() && io_status->ok() && !empty) { + // Add the checksum information to file metadata. + meta->file_checksum = file_writer->GetFileChecksum(); + meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + // Set unique_id only if db_id and db_session_id exist + if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) { + if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id, + meta->fd.GetNumber(), &(meta->unique_id)) + .ok()) { + // if failed to get unique id, just set it Null + meta->unique_id = kNullUniqueId64x2; + } + } + } + + if (s.ok()) { + s = *io_status; + } + + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(s); + } + blob_file_builder.reset(); + } + + // TODO Also check the IO status when create the Iterator. + + TEST_SYNC_POINT("BuildTable:BeforeOutputValidation"); + if (s.ok() && !empty) { + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building. + // No matter whether use_direct_io_for_flush_and_compaction is true, + // the goal is to cache it here for further user reads. + ReadOptions read_options; + std::unique_ptr it(table_cache->NewIterator( + read_options, file_options, tboptions.internal_comparator, *meta, + nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, + nullptr, + (internal_stats == nullptr) ? 
nullptr + : internal_stats->GetFileReadHist(0), + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, tboptions.level_at_creation, + MaxFileSizeForL0MetaPin(mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr, + /*allow_unprepared_value*/ false)); + s = it->status(); + if (s.ok() && paranoid_file_checks) { + OutputValidator file_validator(tboptions.internal_comparator, + /*enable_order_check=*/true, + /*enable_hash=*/true); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + file_validator.Add(it->key(), it->value()).PermitUncheckedError(); + } + s = it->status(); + if (s.ok() && !output_validator.CompareValidator(file_validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } + } + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (!s.ok() || meta->fd.GetFileSize() == 0) { + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + + constexpr IODebugContext* dbg = nullptr; + + if (table_file_created) { + Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); + ignored.PermitUncheckedError(); + } + + assert(blob_file_additions || blob_file_paths.empty()); + + if (blob_file_additions) { + for (const std::string& blob_file_path : blob_file_paths) { + Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); + } + } + } + + Status status_for_listener = s; + if (meta->fd.GetFileSize() == 0) { + fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } + } + // Output to event logger and fire events. + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, status_for_listener, file_checksum, + file_checksum_func_name); + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h new file mode 100644 index 000000000..a028fd2ba --- /dev/null +++ b/src/rocksdb/db/builder.h @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once +#include +#include +#include + +#include "db/range_tombstone_fragmenter.h" +#include "db/seqno_to_time_mapping.h" +#include "db/table_properties_collector.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" +#include "table/scoped_arena_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +struct FileMetaData; + +class VersionSet; +class BlobFileAddition; +class SnapshotChecker; +class TableCache; +class TableBuilder; +class WritableFileWriter; +class InternalStats; +class BlobFileCompletionCallback; + +// Convenience function for NewTableBuilder on the embedded table_factory. +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); + +// Build a Table file from the contents of *iter. The generated file +// will be named according to number specified in meta. On success, the rest of +// *meta will be filled with metadata about the generated table. +// If no data is present in *iter, meta->file_size will be set to +// zero, and no Table file will be produced. +// +// @param column_family_name Name of the column family that is also identified +// by column_family_id, or empty string if unknown. +extern Status BuildTable( + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, + std::vector> + range_del_iters, + FileMetaData* meta, std::vector* blob_file_additions, + std::vector snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker, + bool paranoid_file_checks, InternalStats* internal_stats, + IOStatus* io_status, const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, + const SeqnoToTimeMapping& seqno_to_time_mapping, + EventLogger* event_logger = nullptr, int job_id = 0, + const Env::IOPriority io_priority = Env::IO_HIGH, + TableProperties* table_properties = nullptr, + Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc new file mode 100644 index 000000000..a7e4360c6 --- /dev/null +++ b/src/rocksdb/db/c.cc @@ -0,0 +1,6390 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
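// Editor's sketch: the file below (db/c.cc) implements the C API declared in
// rocksdb/c.h. Typical client usage of that API looks like this; the DB path
// is illustrative, and every fallible call reports errors through a char**
// out-parameter that is left NULL on success:
#include <assert.h>
#include <stdlib.h>
#include "rocksdb/c.h"

int main(void) {
  rocksdb_options_t* options = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(options, 1);

  char* err = NULL;
  rocksdb_t* db = rocksdb_open(options, "/tmp/c_api_demo", &err);
  assert(err == NULL);

  rocksdb_writeoptions_t* wo = rocksdb_writeoptions_create();
  rocksdb_put(db, wo, "key", 3, "value", 5, &err);
  assert(err == NULL);

  rocksdb_writeoptions_destroy(wo);
  rocksdb_close(db);
  rocksdb_options_destroy(options);
  return 0;
}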
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/c.h" + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/utilities/backup_engine.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/write_batch.h" +#include "utilities/merge_operators.h" + +using ROCKSDB_NAMESPACE::BackupEngine; +using ROCKSDB_NAMESPACE::BackupEngineOptions; +using ROCKSDB_NAMESPACE::BackupID; +using ROCKSDB_NAMESPACE::BackupInfo; +using ROCKSDB_NAMESPACE::BatchResult; +using ROCKSDB_NAMESPACE::BlockBasedTableOptions; +using ROCKSDB_NAMESPACE::BottommostLevelCompaction; +using ROCKSDB_NAMESPACE::BytewiseComparator; +using ROCKSDB_NAMESPACE::Cache; +using ROCKSDB_NAMESPACE::Checkpoint; +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyMetaData; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::CompactionFilterFactory; +using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; +using ROCKSDB_NAMESPACE::CompactRangeOptions; +using ROCKSDB_NAMESPACE::Comparator; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::CuckooTableOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::DbPath; +using ROCKSDB_NAMESPACE::Env; +using ROCKSDB_NAMESPACE::EnvOptions; +using ROCKSDB_NAMESPACE::FileLock; +using ROCKSDB_NAMESPACE::FilterPolicy; +using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::InfoLogLevel; +using ROCKSDB_NAMESPACE::IngestExternalFileOptions; +using ROCKSDB_NAMESPACE::Iterator; +using ROCKSDB_NAMESPACE::LevelMetaData; +using ROCKSDB_NAMESPACE::LiveFileMetaData; +using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; +using ROCKSDB_NAMESPACE::MemoryUtil; +using ROCKSDB_NAMESPACE::MergeOperator; +using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; +using ROCKSDB_NAMESPACE::NewGenericRateLimiter; +using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; +using ROCKSDB_NAMESPACE::OptimisticTransactionDB; +using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::PerfContext; +using ROCKSDB_NAMESPACE::PerfLevel; +using ROCKSDB_NAMESPACE::PinnableSlice; +using ROCKSDB_NAMESPACE::PrepopulateBlobCache; +using ROCKSDB_NAMESPACE::RandomAccessFile; +using 
ROCKSDB_NAMESPACE::Range; +using ROCKSDB_NAMESPACE::RateLimiter; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::RestoreOptions; +using ROCKSDB_NAMESPACE::SequentialFile; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::SliceParts; +using ROCKSDB_NAMESPACE::SliceTransform; +using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::SstFileMetaData; +using ROCKSDB_NAMESPACE::SstFileWriter; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; +using ROCKSDB_NAMESPACE::Transaction; +using ROCKSDB_NAMESPACE::TransactionDB; +using ROCKSDB_NAMESPACE::TransactionDBOptions; +using ROCKSDB_NAMESPACE::TransactionLogIterator; +using ROCKSDB_NAMESPACE::TransactionOptions; +using ROCKSDB_NAMESPACE::WALRecoveryMode; +using ROCKSDB_NAMESPACE::WritableFile; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteBatchWithIndex; +using ROCKSDB_NAMESPACE::WriteOptions; + +using std::unordered_set; +using std::vector; + +extern "C" { + +struct rocksdb_t { + DB* rep; +}; +struct rocksdb_backup_engine_t { + BackupEngine* rep; +}; +struct rocksdb_backup_engine_info_t { + std::vector rep; +}; +struct rocksdb_restore_options_t { + RestoreOptions rep; +}; +struct rocksdb_iterator_t { + Iterator* rep; +}; +struct rocksdb_writebatch_t { + WriteBatch rep; +}; +struct rocksdb_writebatch_wi_t { + WriteBatchWithIndex* rep; +}; +struct rocksdb_snapshot_t { + const Snapshot* rep; +}; +struct rocksdb_flushoptions_t { + FlushOptions rep; +}; +struct rocksdb_fifo_compaction_options_t { + CompactionOptionsFIFO rep; +}; +struct rocksdb_readoptions_t { + ReadOptions rep; + // stack variables to set pointers to in ReadOptions + Slice upper_bound; + Slice lower_bound; + Slice timestamp; + Slice iter_start_ts; +}; +struct rocksdb_writeoptions_t { + WriteOptions rep; +}; +struct rocksdb_options_t { + Options rep; +}; +struct rocksdb_compactoptions_t { + CompactRangeOptions rep; + Slice full_history_ts_low; +}; +struct rocksdb_block_based_table_options_t { + BlockBasedTableOptions rep; +}; +struct rocksdb_cuckoo_table_options_t { + CuckooTableOptions rep; +}; +struct rocksdb_seqfile_t { + SequentialFile* rep; +}; +struct rocksdb_randomfile_t { + RandomAccessFile* rep; +}; +struct rocksdb_writablefile_t { + WritableFile* rep; +}; +struct rocksdb_wal_iterator_t { + TransactionLogIterator* rep; +}; +struct rocksdb_wal_readoptions_t { + TransactionLogIterator::ReadOptions rep; +}; +struct rocksdb_filelock_t { + FileLock* rep; +}; +struct rocksdb_logger_t { + std::shared_ptr rep; +}; +struct rocksdb_lru_cache_options_t { + LRUCacheOptions rep; +}; +struct rocksdb_memory_allocator_t { + std::shared_ptr rep; +}; +struct rocksdb_cache_t { + std::shared_ptr rep; +}; +struct rocksdb_livefiles_t { + std::vector rep; +}; +struct rocksdb_column_family_handle_t { + ColumnFamilyHandle* rep; +}; +struct rocksdb_column_family_metadata_t { + ColumnFamilyMetaData rep; +}; +struct rocksdb_level_metadata_t { + const LevelMetaData* rep; +}; +struct rocksdb_sst_file_metadata_t { + const SstFileMetaData* rep; +}; +struct rocksdb_envoptions_t { + EnvOptions rep; +}; +struct rocksdb_ingestexternalfileoptions_t { + IngestExternalFileOptions rep; +}; +struct rocksdb_sstfilewriter_t { + SstFileWriter* rep; +}; +struct rocksdb_ratelimiter_t { + std::shared_ptr rep; +}; +struct rocksdb_perfcontext_t { + PerfContext* rep; +}; +struct rocksdb_pinnableslice_t { + PinnableSlice rep; +}; +struct rocksdb_transactiondb_options_t { + TransactionDBOptions rep; +}; +struct 
rocksdb_transactiondb_t { + TransactionDB* rep; +}; +struct rocksdb_transaction_options_t { + TransactionOptions rep; +}; +struct rocksdb_transaction_t { + Transaction* rep; +}; +struct rocksdb_backup_engine_options_t { + BackupEngineOptions rep; +}; +struct rocksdb_checkpoint_t { + Checkpoint* rep; +}; +struct rocksdb_optimistictransactiondb_t { + OptimisticTransactionDB* rep; +}; +struct rocksdb_optimistictransaction_options_t { + OptimisticTransactionOptions rep; +}; + +struct rocksdb_compactionfiltercontext_t { + CompactionFilter::Context rep; +}; + +struct rocksdb_compactionfilter_t : public CompactionFilter { + void* state_; + void (*destructor_)(void*); + unsigned char (*filter_)(void*, int level, const char* key, size_t key_length, + const char* existing_value, size_t value_length, + char** new_value, size_t* new_value_length, + unsigned char* value_changed); + const char* (*name_)(void*); + unsigned char ignore_snapshots_; + + ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); } + + bool Filter(int level, const Slice& key, const Slice& existing_value, + std::string* new_value, bool* value_changed) const override { + char* c_new_value = nullptr; + size_t new_value_length = 0; + unsigned char c_value_changed = 0; + unsigned char result = + (*filter_)(state_, level, key.data(), key.size(), existing_value.data(), + existing_value.size(), &c_new_value, &new_value_length, + &c_value_changed); + if (c_value_changed) { + new_value->assign(c_new_value, new_value_length); + *value_changed = true; + } + return result; + } + + const char* Name() const override { return (*name_)(state_); } + + bool IgnoreSnapshots() const override { return ignore_snapshots_; } +}; + +struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory { + void* state_; + void (*destructor_)(void*); + rocksdb_compactionfilter_t* (*create_compaction_filter_)( + void*, rocksdb_compactionfiltercontext_t* context); + const char* (*name_)(void*); + + ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + rocksdb_compactionfiltercontext_t ccontext; + ccontext.rep = context; + CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext); + return std::unique_ptr(cf); + } + + const char* Name() const override { return (*name_)(state_); } +}; + +struct rocksdb_comparator_t : public Comparator { + void* state_; + void (*destructor_)(void*); + int (*compare_)(void*, const char* a, size_t alen, const char* b, + size_t blen); + const char* (*name_)(void*); + int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen); + int (*compare_without_ts_)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, size_t blen, + unsigned char b_has_ts); + + rocksdb_comparator_t() : Comparator() {} + + rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {} + + ~rocksdb_comparator_t() override { (*destructor_)(state_); } + + int Compare(const Slice& a, const Slice& b) const override { + return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); + } + + int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override { + if (compare_ts_ == nullptr) { + return 0; + } + return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(), + b_ts.size()); + } + + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + if (compare_without_ts_ == nullptr) { + return 
Compare(a, b); + } + return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts, + b.data(), b.size(), b_has_ts); + } + + const char* Name() const override { return (*name_)(state_); } + + // No-ops since the C binding does not support key shortening methods. + void FindShortestSeparator(std::string*, const Slice&) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +struct rocksdb_filterpolicy_t : public FilterPolicy { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + + ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); } + + const char* Name() const override { return (*name_)(state_); } +}; + +struct rocksdb_mergeoperator_t : public MergeOperator { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*full_merge_)(void*, const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length); + char* (*partial_merge_)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length); + void (*delete_value_)(void*, const char* value, size_t value_length); + + ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); } + + const char* Name() const override { return (*name_)(state_); } + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + size_t n = merge_in.operand_list.size(); + std::vector operand_pointers(n); + std::vector operand_sizes(n); + for (size_t i = 0; i < n; i++) { + Slice operand(merge_in.operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + const char* existing_value_data = nullptr; + size_t existing_value_len = 0; + if (merge_in.existing_value != nullptr) { + existing_value_data = merge_in.existing_value->data(); + existing_value_len = merge_in.existing_value->size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*full_merge_)( + state_, merge_in.key.data(), merge_in.key.size(), existing_value_data, + existing_value_len, &operand_pointers[0], &operand_sizes[0], + static_cast(n), &success, &new_value_len); + merge_out->new_value.assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + size_t operand_count = operand_list.size(); + std::vector operand_pointers(operand_count); + std::vector operand_sizes(operand_count); + for (size_t i = 0; i < operand_count; ++i) { + Slice operand(operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*partial_merge_)( + state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], + static_cast(operand_count), &success, &new_value_len); + new_value->assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } +}; + +struct rocksdb_dbpath_t { + DbPath rep; +}; + 
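// Editor's sketch: rocksdb_comparator_t above adapts C function pointers to
// the C++ Comparator interface. A client would wire one up roughly as below;
// the callback bodies are illustrative, `options` is assumed to be a
// rocksdb_options_t* created earlier, and memcmp requires <string.h>:
//
//   static int cmp_impl(void* state, const char* a, size_t alen,
//                       const char* b, size_t blen) {
//     size_t n = (alen < blen) ? alen : blen;
//     int r = memcmp(a, b, n);
//     if (r == 0 && alen != blen) r = (alen < blen) ? -1 : 1;
//     return r;
//   }
//   static const char* cmp_name(void* state) { return "demo-bytewise"; }
//   static void cmp_destroy(void* state) {}
//
//   rocksdb_comparator_t* cmp =
//       rocksdb_comparator_create(NULL, cmp_destroy, cmp_impl, cmp_name);
//   rocksdb_options_set_comparator(options, cmp);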
+struct rocksdb_env_t { + Env* rep; + bool is_default; +}; + +struct rocksdb_slicetransform_t : public SliceTransform { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*transform_)(void*, const char* key, size_t length, + size_t* dst_length); + unsigned char (*in_domain_)(void*, const char* key, size_t length); + unsigned char (*in_range_)(void*, const char* key, size_t length); + + ~rocksdb_slicetransform_t() override { (*destructor_)(state_); } + + const char* Name() const override { return (*name_)(state_); } + + Slice Transform(const Slice& src) const override { + size_t len; + char* dst = (*transform_)(state_, src.data(), src.size(), &len); + return Slice(dst, len); + } + + bool InDomain(const Slice& src) const override { + return (*in_domain_)(state_, src.data(), src.size()); + } + + bool InRange(const Slice& src) const override { + return (*in_range_)(state_, src.data(), src.size()); + } +}; + +struct rocksdb_universal_compaction_options_t { + ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep; +}; + +static bool SaveError(char** errptr, const Status& s) { + assert(errptr != nullptr); + if (s.ok()) { + return false; + } else if (*errptr == nullptr) { + *errptr = strdup(s.ToString().c_str()); + } else { + // TODO(sanjay): Merge with existing error? + // This is a bug if *errptr is not created by malloc() + free(*errptr); + *errptr = strdup(s.ToString().c_str()); + } + return true; +} + +static char* CopyString(const std::string& str) { + char* result = reinterpret_cast(malloc(sizeof(char) * str.size())); + memcpy(result, str.data(), sizeof(char) * str.size()); + return result; +} + +rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_with_ttl(const rocksdb_options_t* options, + const char* name, int ttl, char** errptr) { + ROCKSDB_NAMESPACE::DBWithTTL* db; + if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open( + options->rep, std::string(name), &db, ttl))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options, + const char* name, + unsigned char error_if_wal_file_exists, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), + &db, error_if_wal_file_exists))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open( + options->rep.env, + BackupEngineOptions(path, nullptr, true, + options->rep.info_log.get()), + &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const 
+rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts(
+    const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env,
+    char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+                                             rocksdb_t* db, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(
+    rocksdb_backup_engine_t* be, rocksdb_t* db,
+    unsigned char flush_before_backup, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+                                             uint32_t num_backups_to_keep,
+                                             char** errptr) {
+  SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+  return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+                                                int v) {
+  opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+                                         uint32_t backup_id, char** errptr) {
+  SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+                                                       std::string(wal_dir),
+                                                       restore_options->rep));
+}
+
+void rocksdb_backup_engine_restore_db_from_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+    char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir),
+                                                 std::string(wal_dir),
+                                                 restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be) {
+  rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+  be->rep->GetBackupInfo(&result->rep);
+  return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+  return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info) {
+  delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+  delete be->rep;
+  delete be;
+}
+
+rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create(
+    const char* backup_dir) {
+  return new rocksdb_backup_engine_options_t{
+      BackupEngineOptions(std::string(backup_dir))};
+}
+
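+// Restore flow sketch (illustrative; directories are placeholders). The same
+// directory may be passed for db_dir and wal_dir when WALs live in the DB dir:
+//
+//   rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
+//   rocksdb_backup_engine_restore_db_from_latest_backup(
+//       be, "/tmp/db", "/tmp/db", ro, &err);
+//   rocksdb_restore_options_destroy(ro);
+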
+void rocksdb_backup_engine_options_set_backup_dir(
+    rocksdb_backup_engine_options_t* options, const char* backup_dir) {
+  options->rep.backup_dir = std::string(backup_dir);
+}
+
+void rocksdb_backup_engine_options_set_env(
+    rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) {
+  options->rep.backup_env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_backup_engine_options_set_share_table_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.share_table_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_share_table_files(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.share_table_files;
+}
+
+void rocksdb_backup_engine_options_set_sync(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.sync = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_sync(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.sync;
+}
+
+void rocksdb_backup_engine_options_set_destroy_old_data(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.destroy_old_data = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_destroy_old_data(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.destroy_old_data;
+}
+
+void rocksdb_backup_engine_options_set_backup_log_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.backup_log_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_backup_log_files(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.backup_log_files;
+}
+
+void rocksdb_backup_engine_options_set_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit) {
+  options->rep.backup_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.backup_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit) {
+  options->rep.restore_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.restore_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_max_background_operations(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.max_background_operations = val;
+}
+
+int rocksdb_backup_engine_options_get_max_background_operations(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.max_background_operations;
+}
+
+void rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options, uint64_t size) {
+  options->rep.callback_trigger_interval_size = size;
+}
+
+uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.callback_trigger_interval_size;
+}
+
+void rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.max_valid_backups_to_open = val;
+}
+
+int rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.max_valid_backups_to_open;
+}
+
+void rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.share_files_with_checksum_naming =
+      static_cast<BackupEngineOptions::ShareFilesNaming>(val);
+}
+
+int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options) {
+  return static_cast<int>(options->rep.share_files_with_checksum_naming);
+}
+
+void rocksdb_backup_engine_options_destroy(
+    rocksdb_backup_engine_options_t* options) {
+  delete options;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+                                                       char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+                               const char* checkpoint_dir,
+                               uint64_t log_size_for_flush, char** errptr) {
+  SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+                        std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+  delete checkpoint->rep;
+  delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+  opt->rep.merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_and_trim_history(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+    size_t trim_tslen, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  std::string trim_ts_(trim_ts, trim_tslen);
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAndTrimHistory(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, trim_ts_))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
+                                 column_families, &handles, &db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
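+// Caller-side sketch for the multi-CF open above (illustrative; names and
+// counts are placeholders). The handles array must have num_column_families
+// slots, and the "default" column family must always be listed:
+//
+//   const char* cf_names[2] = {"default", "aux"};
+//   const rocksdb_options_t* cf_opts[2] = {opts, opts};
+//   rocksdb_column_family_handle_t* handles[2];
+//   rocksdb_t* db = rocksdb_open_column_families(
+//       opts, "/tmp/db", 2, cf_names, cf_opts, handles, &err);
+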
+rocksdb_t* rocksdb_open_column_families_with_ttl(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+    char** errptr) {
+  std::vector<int32_t> ttls_vec;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    ttls_vec.push_back(ttls[i]);
+
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  ROCKSDB_NAMESPACE::DBWithTTL* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, ttls_vec))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_wal_file_exists, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr,
+                DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                    std::string(name), column_families,
+                                    &handles, &db, error_if_wal_file_exists))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    const char* secondary_path, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i != num_column_families; ++i) {
+    column_families.emplace_back(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep));
+  }
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+                                            std::string(name),
+                                            std::string(secondary_path),
+                                            column_families, &handles, &db))) {
+    return nullptr;
+  }
+  for (size_t i = 0; i != handles.size(); ++i) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+char** rocksdb_list_column_families(const rocksdb_options_t* options,
+                                    const char* name, size_t* lencfs,
+                                    char** errptr) {
+  std::vector<std::string> fams;
+  SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep),
+                                           std::string(name), &fams));
+
+  *lencfs = fams.size();
+  char** column_families =
+      static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+  for (size_t i = 0; i < fams.size(); i++) {
+    column_families[i] = strdup(fams[i].c_str());
+  }
+  return column_families;
+}
+
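+// The returned array and each name are heap-allocated; release them with the
+// dedicated destroy call below (sketch; opts and the path are placeholders):
+//
+//   size_t n = 0;
+//   char** cfs = rocksdb_list_column_families(opts, "/tmp/db", &n, &err);
+//   for (size_t i = 0; err == NULL && i < n; i++) printf("%s\n", cfs[i]);
+//   rocksdb_list_column_families_destroy(cfs, n);
+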
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+  for (size_t i = 0; i < len; ++i) {
+    free(list[i]);
+  }
+  free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, db->rep->CreateColumnFamily(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, int ttl, char** errptr) {
+  ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl =
+      static_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(db->rep);
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep), ttl));
+  return handle;
+}
+
+void rocksdb_drop_column_family(rocksdb_t* db,
+                                rocksdb_column_family_handle_t* handle,
+                                char** errptr) {
+  SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+uint32_t rocksdb_column_family_handle_get_id(
+    rocksdb_column_family_handle_t* handle) {
+  return handle->rep->GetID();
+}
+
+char* rocksdb_column_family_handle_get_name(
+    rocksdb_column_family_handle_t* handle, size_t* name_len) {
+  auto name = handle->rep->GetName();
+  *name_len = name.size();
+  return CopyString(name);
+}
+
+void rocksdb_column_family_handle_destroy(
+    rocksdb_column_family_handle_t* handle) {
+  delete handle->rep;
+  delete handle;
+}
+
+void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                 const char* key, size_t keylen, const char* val, size_t vallen,
+                 char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                    rocksdb_column_family_handle_t* column_family,
+                    const char* key, size_t keylen, const char* val,
+                    size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Put(options->rep, column_family->rep,
+                                 Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                         const char* key, size_t keylen, const char* ts,
+                         size_t tslen, const char* val, size_t vallen,
+                         char** errptr) {
+  SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen),
+                                 Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf_with_ts(rocksdb_t* db,
+                            const rocksdb_writeoptions_t* options,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* key, size_t keylen, const char* ts,
+                            size_t tslen, const char* val, size_t vallen,
+                            char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, column_family->rep, Slice(key, keylen),
+                         Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                    const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                       rocksdb_column_family_handle_t* column_family,
+                       const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+                                    Slice(key, keylen)));
+}
+
+void rocksdb_delete_with_ts(rocksdb_t* db,
+                            const rocksdb_writeoptions_t* options,
+                            const char* key, size_t keylen, const char* ts,
+                            size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen),
+                                    Slice(ts, tslen)));
+}
+
+void rocksdb_delete_cf_with_ts(rocksdb_t* db,
+                               const rocksdb_writeoptions_t* options,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, size_t keylen, const char* ts,
+                               size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+                                    Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                          const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_cf(rocksdb_t* db,
+                             const rocksdb_writeoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep,
+                                          Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_with_ts(rocksdb_t* db,
+                                  const rocksdb_writeoptions_t* options,
+                                  const char* key, size_t keylen,
+                                  const char* ts, size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen),
+                                          Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete_cf_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr) {
+  SaveError(errptr,
+            db->rep->SingleDelete(options->rep, column_family->rep,
+                                  Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_increase_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* ts_low, size_t ts_lowlen, char** errptr) {
+  std::string ts(ts_low, ts_lowlen);
+  SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts));
+}
+
+char* rocksdb_get_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    size_t* ts_len, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp);
+  if (s.ok()) {
+    *ts_len = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *ts_len = 0;
+    SaveError(errptr, s);
+  }
+  return result;
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+                             const rocksdb_writeoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* start_key, size_t start_key_len,
+                             const char* end_key, size_t end_key_len,
+                             char** errptr) {
+  SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+                                         Slice(start_key, start_key_len),
+                                         Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                   const char* key, size_t keylen, const char* val,
+                   size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen),
+                                   Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                      rocksdb_column_family_handle_t* column_family,
+                      const char* key, size_t keylen, const char* val,
+                      size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Merge(options->rep, column_family->rep,
+                                   Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                   rocksdb_writebatch_t* batch, char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                  const char* key, size_t keylen, size_t* vallen,
+                  char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                     rocksdb_column_family_handle_t* column_family,
+                     const char* key, size_t keylen, size_t* vallen,
+                     char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s =
+      db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                          const char* key, size_t keylen, size_t* vallen,
+                          char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp_val;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp_val.size();
+    result = CopyString(tmp_val);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf_with_ts(rocksdb_t* db,
+                             const rocksdb_readoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, size_t keylen, size_t* vallen,
+                             char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &tmp, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                       size_t num_keys, const char* const* keys_list,
+                       const size_t* keys_list_sizes, char** values_list,
+                       size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
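+// Ownership sketch for the point lookups above (illustrative; db/ropts/err
+// are placeholders). rocksdb_get() returns a CopyString() buffer that is NOT
+// NUL-terminated; the length comes back through *vallen and the caller must
+// free() it. A missing key yields NULL with no error set:
+//
+//   size_t vlen = 0;
+//   char* v = rocksdb_get(db, ropts, "k", 1, &vlen, &err);
+//   if (v != NULL) {
+//     fwrite(v, 1, vlen, stdout);
+//     free(v);
+//   }
+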
+void rocksdb_multi_get_with_ts(rocksdb_t* db,
+                               const rocksdb_readoptions_t* options,
+                               size_t num_keys, const char* const* keys_list,
+                               const size_t* keys_list_sizes,
+                               char** values_list, size_t* values_list_sizes,
+                               char** timestamp_list,
+                               size_t* timestamp_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamp_list[i] = CopyString(timestamps[i]);
+      timestamp_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamp_list[i] = nullptr;
+      timestamp_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamps_list[i] = CopyString(timestamps[i]);
+      timestamps_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamps_list[i] = nullptr;
+      timestamps_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
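+// For the MultiGet wrappers above, values/errs are caller-provided parallel
+// arrays sized num_keys: a found key sets values_list[i] (free() it), a
+// missing key leaves both values_list[i] and errs[i] NULL, and a real error
+// sets errs[i] to a strdup()'d message the caller must free().
+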
+void rocksdb_batched_multi_get_cf(rocksdb_t* db,
+                                  const rocksdb_readoptions_t* options,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  size_t num_keys, const char* const* keys_list,
+                                  const size_t* keys_list_sizes,
+                                  rocksdb_pinnableslice_t** values, char** errs,
+                                  const bool sorted_input) {
+  Slice* key_slices = new Slice[num_keys];
+  PinnableSlice* value_slices = new PinnableSlice[num_keys];
+  Status* statuses = new Status[num_keys];
+  for (size_t i = 0; i < num_keys; ++i) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+
+  db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+                    value_slices, statuses, sorted_input);
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    if (statuses[i].ok()) {
+      values[i] = new (rocksdb_pinnableslice_t);
+      values[i]->rep = std::move(value_slices[i]);
+      errs[i] = nullptr;
+    } else {
+      values[i] = nullptr;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+
+  delete[] key_slices;
+  delete[] value_slices;
+  delete[] statuses;
+}
+
+unsigned char rocksdb_key_may_exist(rocksdb_t* db,
+                                    const rocksdb_readoptions_t* options,
+                                    const char* key, size_t key_len,
+                                    char** value, size_t* val_len,
+                                    const char* timestamp, size_t timestamp_len,
+                                    unsigned char* value_found) {
+  std::string tmp;
+  std::string time;
+  if (timestamp) {
+    time.assign(timestamp, timestamp_len);
+  }
+  bool found = false;
+  const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len),
+                                           &tmp, timestamp ? &time : nullptr,
+                                           value_found ? &found : nullptr);
+  if (value_found) {
+    *value_found = found;
+    if (found) {
+      *val_len = tmp.size();
+      *value = CopyString(tmp);
+    }
+  }
+  return result;
+}
+
+unsigned char rocksdb_key_may_exist_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t key_len, char** value, size_t* val_len, const char* timestamp,
+    size_t timestamp_len, unsigned char* value_found) {
+  std::string tmp;
+  std::string time;
+  if (timestamp) {
+    time.assign(timestamp, timestamp_len);
+  }
+  bool found = false;
+  const bool result = db->rep->KeyMayExist(
+      options->rep, column_family->rep, Slice(key, key_len), &tmp,
+      timestamp ? &time : nullptr, value_found ? &found : nullptr);
+  if (value_found) {
+    *value_found = found;
+    if (found) {
+      *val_len = tmp.size();
+      *value = CopyString(tmp);
+    }
+  }
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db, const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+    rocksdb_t* db, uint64_t seq_number,
+    const rocksdb_wal_readoptions_t* options, char** errptr) {
+  std::unique_ptr<TransactionLogIterator> iter;
+  TransactionLogIterator::ReadOptions ro;
+  if (options != nullptr) {
+    ro = options->rep;
+  }
+  if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+    return nullptr;
+  }
+  rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+  result->rep = iter.release();
+  return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); }
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+                             char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+  rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+  BatchResult wal_batch = iter->rep->GetBatch();
+  result->rep = std::move(*wal_batch.writeBatchPtr);
+  if (seq != nullptr) {
+    *seq = wal_batch.sequence;
+  }
+  return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+  return db->rep->GetLatestSequenceNumber();
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
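+// Iterator loop sketch (illustrative; db/ropts/err are placeholders). Keys
+// and values returned by rocksdb_iter_key()/rocksdb_iter_value() point into
+// the iterator and are only valid until the next move or destroy:
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     /* ... */
+//   }
+//   rocksdb_iter_get_error(it, &err);
+//   rocksdb_iter_destroy(it);
+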
+void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts,
+                              rocksdb_column_family_handle_t** column_families,
+                              rocksdb_iterator_t** iterators, size_t size,
+                              char** errptr) {
+  std::vector<ColumnFamilyHandle*> column_families_vec;
+  for (size_t i = 0; i < size; i++) {
+    column_families_vec.push_back(column_families[i]->rep);
+  }
+
+  std::vector<Iterator*> res;
+  Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+  assert(res.size() == size);
+  if (SaveError(errptr, status)) {
+    return;
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    iterators[i] = new rocksdb_iterator_t;
+    iterators[i]->rep = res[i];
+  }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_release_snapshot(rocksdb_t* db,
+                              const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_property_value(rocksdb_t* db, const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+                         uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int rocksdb_property_int_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* propname, uint64_t* out_val) {
+  if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+char* rocksdb_property_value_cf(rocksdb_t* db,
+                                rocksdb_column_family_handle_t* column_family,
+                                const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges,
+                               const char* const* range_start_key,
+                               const size_t* range_start_key_len,
+                               const char* const* range_limit_key,
+                               const size_t* range_limit_key_len,
+                               uint64_t* sizes, char** errptr) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  if (!s.ok()) {
+    SaveError(errptr, s);
+  }
+  delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  Status s = db->rep->GetApproximateSizes(column_family->rep, ranges,
+                                          num_ranges, sizes);
+  if (!s.ok()) {
+    SaveError(errptr, s);
+  }
+  delete[] ranges;
+}
+
+void rocksdb_delete_file(rocksdb_t* db, const char* name) {
+  db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) {
+  rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+  db->rep->GetLiveFilesMetaData(&result->rep);
+  return result;
+}
+
+void rocksdb_compact_range(rocksdb_t* db, const char* start_key,
+                           size_t start_key_len, const char* limit_key,
+                           size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(),
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(rocksdb_t* db,
+                              rocksdb_column_family_handle_t* column_family,
+                              const char* start_key, size_t start_key_len,
+                              const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(), column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_suggest_compact_range(rocksdb_t* db, const char* start_key,
+                                   size_t start_key_len, const char* limit_key,
+                                   size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  Status s = ROCKSDB_NAMESPACE::experimental::SuggestCompactRange(
+      db->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+  SaveError(errptr, s);
+}
+
+void rocksdb_suggest_compact_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  Status s = db->rep->SuggestCompactRange(
+      column_family->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+  SaveError(errptr, s);
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+                               const char* start_key, size_t start_key_len,
+                               const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  rocksdb_compactoptions_t* opt,
+                                  const char* start_key, size_t start_key_len,
+                                  const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep, column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+                   char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+                      rocksdb_column_family_handle_t* column_family,
+                      char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
+  SaveError(errptr, db->rep->FlushWAL(sync));
+}
+
+void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) {
+  SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force,
+                                   char** errptr) {
+  SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name,
+                        char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(const rocksdb_options_t* options, const char* name,
+                       char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+                                size_t klen) {
+  iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); }
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); }
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
+                                   size_t* tslen) {
+  Slice s = iter->rep->timestamp();
+  *tslen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                     size_t size) {
+  rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+  b->rep = WriteBatch(std::string(rep, size));
+  return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
+
+void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
+                            size_t klen, const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, size_t klen, const char* val,
+                               size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+    size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
+             Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
+                             const char* const* keys_list,
+                             const size_t* keys_list_sizes, int num_values,
+                             const char* const* values_list,
+                             const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
+                                rocksdb_column_family_handle_t* column_family,
+                                int num_keys, const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
+                              size_t klen, const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b,
+                                 rocksdb_column_family_handle_t* column_family,
+                                 const char* key, size_t klen, const char* val,
+                                 size_t vlen) {
+  b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
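+// Write-batch sketch (illustrative; db/wopts/err are placeholders): batch
+// several mutations and commit them atomically with rocksdb_write():
+//
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "k1", 2, "v1", 2);
+//   rocksdb_writebatch_delete(wb, "k2", 2);
+//   rocksdb_write(db, wopts, wb, &err);
+//   rocksdb_writebatch_destroy(wb);
+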
+void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
+                               const char* const* keys_list,
+                               const size_t* keys_list_sizes, int num_values,
+                               const char* const* values_list,
+                               const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  int num_keys, const char* const* keys_list,
+                                  const size_t* keys_list_sizes, int num_values,
+                                  const char* const* values_list,
+                                  const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
+                               size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key,
+                                     size_t klen) {
+  b->rep.SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_singledelete_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep.SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen) {
+  b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
+                                const char* const* keys_list,
+                                const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+                                     const char* start_key,
+                                     size_t start_key_len, const char* end_key,
+                                     size_t end_key_len) {
+  b->rep.DeleteRange(Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+                                      const char* const* start_keys_list,
+                                      const size_t* start_keys_list_sizes,
+                                      const char* const* end_keys_list,
+                                      const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(column_family->rep,
+                     SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
+                                     size_t len) {
+  b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+  void* state_;
+  void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+  void (*deleted_)(void*, const char* k, size_t klen);
+  void Put(const Slice& key, const Slice& value) override {
+    (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+  }
+  void Delete(const Slice& key) override {
+    (*deleted_)(state_, key.data(), key.size());
+  }
+};
+
+void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
+                                void (*put)(void*, const char* k, size_t klen,
+                                            const char* v, size_t vlen),
+                                void (*deleted)(void*, const char* k,
+                                                size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+  b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+                                               char** errptr) {
+  SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+  SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+    size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+                                   overwrite_key);
+  return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+  if (b->rep) {
+    delete b->rep;
+  }
+  delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+  b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+  return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key,
+                               size_t klen, const char* val, size_t vlen) {
+  b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen, const char* val,
+                                  size_t vlen) {
+  b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
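+// WriteBatchWithIndex sketch (illustrative; db/ropts/wopts/err are
+// placeholders): buffered writes stay readable through the batch before they
+// are committed:
+//
+//   rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, 1);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "v", 1);
+//   size_t vlen = 0;
+//   char* v = rocksdb_writebatch_wi_get_from_batch_and_db(
+//       wbwi, db, ropts, "k", 1, &vlen, &err);  // sees the pending put
+//   rocksdb_write_writebatch_wi(db, wopts, wbwi, &err);
+//   rocksdb_writebatch_wi_destroy(wbwi);
+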
+void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
+                                const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key,
+                                 size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                  const char* const* keys_list,
+                                  const size_t* keys_list_sizes, int num_values,
+                                  const char* const* values_list,
+                                  const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
+                                  size_t klen) {
+  b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
+                                        const char* key, size_t klen) {
+  b->rep->SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                   const char* const* keys_list,
+                                   const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+                                        const char* start_key,
+                                        size_t start_key_len,
+                                        const char* end_key,
+                                        size_t end_key_len) {
+  b->rep->DeleteRange(Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
+                                         int num_keys,
+                                         const char* const* start_keys_list,
+                                         const size_t* start_keys_list_sizes,
+                                         const char* const* end_keys_list,
+                                         const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(column_family->rep,
+                      SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
+                                        const char* blob, size_t len) {
+  b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+    rocksdb_writebatch_wi_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
b->rep->GetWriteBatch(); + *size = wb->GetDataSize(); + return wb->Data().c_str(); +} + +void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) { + b->rep->SetSavePoint(); +} + +void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b, + char** errptr) { + SaveError(errptr, b->rep->RollbackToSavePoint()); +} + +rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep); + delete base_iterator; + return result; +} + +rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = + wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep); + delete base_iterator; + return result; +} + +char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi, + const rocksdb_options_t* options, + const char* key, size_t keylen, + size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_writebatch_wi_get_from_batch_cf( + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_writebatch_wi_get_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = wbwi->rep->GetFromBatchAndDB( + db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +void rocksdb_write_writebatch_wi(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_wi_t* wbwi, char** errptr) { + WriteBatch* wb = wbwi->rep->GetWriteBatch(); + SaveError(errptr, db->rep->Write(options->rep, wb)); +} + +void rocksdb_load_latest_options( + const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options, + rocksdb_cache_t* cache, 
rocksdb_options_t** db_options, + size_t* num_column_families, char*** list_column_family_names, + rocksdb_options_t*** list_column_family_options, char** errptr) { + DBOptions db_opt; + std::vector<ColumnFamilyDescriptor> cf_descs; + Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt, + &cf_descs, ignore_unknown_options, &cache->rep); + if (s.ok()) { + char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*)); + rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc( + cf_descs.size() * sizeof(rocksdb_options_t*)); + for (size_t i = 0; i < cf_descs.size(); ++i) { + cf_names[i] = strdup(cf_descs[i].name.c_str()); + cf_options[i] = new rocksdb_options_t{ + Options(DBOptions(), std::move(cf_descs[i].options))}; + } + *num_column_families = cf_descs.size(); + *db_options = new rocksdb_options_t{ + Options(std::move(db_opt), ColumnFamilyOptions())}; + *list_column_family_names = cf_names; + *list_column_family_options = cf_options; + } else { + *num_column_families = 0; + *db_options = nullptr; + *list_column_family_names = nullptr; + *list_column_family_options = nullptr; + SaveError(errptr, s); + } +} + +void rocksdb_load_latest_options_destroy( + rocksdb_options_t* db_options, char** list_column_family_names, + rocksdb_options_t** list_column_family_options, size_t len) { + rocksdb_options_destroy(db_options); + if (list_column_family_names) { + for (size_t i = 0; i < len; ++i) { + free(list_column_family_names[i]); + } + free(list_column_family_names); + } + if (list_column_family_options) { + for (size_t i = 0; i < len; ++i) { + rocksdb_options_destroy(list_column_family_options[i]); + } + free(list_column_family_options); + } +} + +rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() { + return new rocksdb_block_based_table_options_t; +} + +void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options) { + delete options; +} + +void rocksdb_block_based_options_set_checksum( + rocksdb_block_based_table_options_t* opt, char v) { + opt->rep.checksum = static_cast<ROCKSDB_NAMESPACE::ChecksumType>(v); +} + +void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size) { + options->rep.block_size = block_size; +} + +void rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation) { + options->rep.block_size_deviation = block_size_deviation; +} + +void rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval) { + options->rep.block_restart_interval = block_restart_interval; +} + +void rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, + int index_block_restart_interval) { + options->rep.index_block_restart_interval = index_block_restart_interval; +} + +void rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, + uint64_t metadata_block_size) { + options->rep.metadata_block_size = metadata_block_size; +} + +void rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, + unsigned char partition_filters) { + options->rep.partition_filters = partition_filters; +} + +void rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, + unsigned char use_delta_encoding) { + options->rep.use_delta_encoding = use_delta_encoding; +} + +void rocksdb_block_based_options_set_filter_policy( +
rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy) { + options->rep.filter_policy.reset(filter_policy); +} + +void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, + unsigned char no_block_cache) { + options->rep.no_block_cache = no_block_cache; +} + +void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache) { + if (block_cache) { + options->rep.block_cache = block_cache->rep; + } +} + +void rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed) { + if (block_cache_compressed) { + options->rep.block_cache_compressed = block_cache_compressed->rep; + } +} + +void rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.whole_key_filtering = v; +} + +void rocksdb_block_based_options_set_format_version( + rocksdb_block_based_table_options_t* options, int v) { + options->rep.format_version = v; +} + +void rocksdb_block_based_options_set_index_type( + rocksdb_block_based_table_options_t* options, int v) { + options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v); +} + +void rocksdb_block_based_options_set_data_block_index_type( + rocksdb_block_based_table_options_t* options, int v) { + options->rep.data_block_index_type = + static_cast<BlockBasedTableOptions::DataBlockIndexType>(v); +} + +void rocksdb_block_based_options_set_data_block_hash_ratio( + rocksdb_block_based_table_options_t* options, double v) { + options->rep.data_block_hash_table_util_ratio = v; +} + +void rocksdb_block_based_options_set_cache_index_and_filter_blocks( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.cache_index_and_filter_blocks = v; +} + +void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.cache_index_and_filter_blocks_with_high_priority = v; +} + +void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.pin_l0_filter_and_index_blocks_in_cache = v; +} + +void rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.pin_top_level_index_and_filter = v; +} + +void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t* opt, + rocksdb_block_based_table_options_t* table_options) { + if (table_options) { + opt->rep.table_factory.reset( + ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep)); + } +} + +rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() { + return new rocksdb_cuckoo_table_options_t; +} + +void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) { + delete options; +} + +void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v) { + options->rep.hash_table_ratio = v; +} + +void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.max_search_depth = v; +} + +void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.cuckoo_block_size = v; +} + +void rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { +
options->rep.identity_as_first_hash = v; +} + +void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { + options->rep.use_module_hash = v; +} + +void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) { + if (table_options) { + opt->rep.table_factory.reset( + ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep)); + } +} + +void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[], + const char* const values[], char** errptr) { + std::unordered_map<std::string, std::string> options_map; + for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + SaveError(errptr, db->rep->SetOptions(options_map)); +} + +void rocksdb_set_options_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* handle, int count, + const char* const keys[], + const char* const values[], char** errptr) { + std::unordered_map<std::string, std::string> options_map; + for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + SaveError(errptr, db->rep->SetOptions(handle->rep, options_map)); +} + +rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; } + +void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; } + +rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) { + return new rocksdb_options_t(*options); +} + +void rocksdb_options_increase_parallelism(rocksdb_options_t* opt, + int total_threads) { + opt->rep.IncreaseParallelism(total_threads); +} + +void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt, + uint64_t block_cache_size_mb) { + opt->rep.OptimizeForPointLookup(block_cache_size_mb); +} + +void rocksdb_options_optimize_level_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget) { + opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget); +} + +void rocksdb_options_optimize_universal_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget) { + opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget); +} + +void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.allow_ingest_behind = v; +} + +unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) { + return opt->rep.allow_ingest_behind; +} + +void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt, + rocksdb_compactionfilter_t* filter) { + opt->rep.compaction_filter = filter; +} + +void rocksdb_options_set_compaction_filter_factory( + rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) { + opt->rep.compaction_filter_factory = + std::shared_ptr<CompactionFilterFactory>(factory); +} + +void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt, + size_t s) { + opt->rep.compaction_readahead_size = s; +} + +size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) { + return opt->rep.compaction_readahead_size; +} + +void rocksdb_options_set_comparator(rocksdb_options_t* opt, + rocksdb_comparator_t* cmp) { + opt->rep.comparator = cmp; +} + +void rocksdb_options_set_merge_operator( + rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) { + opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator); +} + +void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.create_if_missing = v; +} + +unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) { + return opt->rep.create_if_missing; +} + +void
rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.create_missing_column_families = v; +} + +unsigned char rocksdb_options_get_create_missing_column_families( + rocksdb_options_t* opt) { + return opt->rep.create_missing_column_families; +} + +void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.error_if_exists = v; +} + +unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) { + return opt->rep.error_if_exists; +} + +void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.paranoid_checks = v; +} + +unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) { + return opt->rep.paranoid_checks; +} + +void rocksdb_options_set_db_paths(rocksdb_options_t* opt, + const rocksdb_dbpath_t** dbpath_values, + size_t num_paths) { + std::vector<DbPath> db_paths(num_paths); + for (size_t i = 0; i < num_paths; ++i) { + db_paths[i] = dbpath_values[i]->rep; + } + opt->rep.db_paths = db_paths; +} + +void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { + opt->rep.env = (env ? env->rep : nullptr); +} + +void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { + if (l) { + opt->rep.info_log = l->rep; + } +} + +void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) { + opt->rep.info_log_level = static_cast<InfoLogLevel>(v); +} + +int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) { + return static_cast<int>(opt->rep.info_log_level); +} + +void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, + size_t s) { + opt->rep.db_write_buffer_size = s; +} + +size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.db_write_buffer_size; +} + +void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { + opt->rep.write_buffer_size = s; +} + +size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.write_buffer_size; +} + +void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { + opt->rep.max_open_files = n; +} + +int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) { + return opt->rep.max_open_files; +} + +void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, + int n) { + opt->rep.max_file_opening_threads = n; +} + +int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) { + return opt->rep.max_file_opening_threads; +} + +void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, + uint64_t n) { + opt->rep.max_total_wal_size = n; +} + +uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) { + return opt->rep.max_total_wal_size; +} + +void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt, + uint64_t n) { + opt->rep.target_file_size_base = n; +} + +uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) { + return opt->rep.target_file_size_base; +} + +void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt, + int n) { + opt->rep.target_file_size_multiplier = n; +} + +int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) { + return opt->rep.target_file_size_multiplier; +} + +void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, + uint64_t n) { + opt->rep.max_bytes_for_level_base = n; +} + +uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_base; +} + +void
rocksdb_options_set_level_compaction_dynamic_level_bytes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.level_compaction_dynamic_level_bytes = v; +} + +unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes( + rocksdb_options_t* opt) { + return opt->rep.level_compaction_dynamic_level_bytes; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt, + double n) { + opt->rep.max_bytes_for_level_multiplier = n; +} + +double rocksdb_options_get_max_bytes_for_level_multiplier( + rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_multiplier; +} + +void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt, + uint64_t n) { + opt->rep.max_compaction_bytes = n; +} + +uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) { + return opt->rep.max_compaction_bytes; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t* opt, int* level_values, size_t num_levels) { + opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i]; + } +} + +void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { + opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +} + +void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.skip_stats_update_on_db_open = val; +} + +unsigned char rocksdb_options_get_skip_stats_update_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_stats_update_on_db_open; +} + +void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val) { + opt->rep.skip_checking_sst_file_sizes_on_db_open = val; +} + +unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_checking_sst_file_sizes_on_db_open; +} + +/* Blob Options Settings */ +void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_files = val; +} +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt) { + return opt->rep.enable_blob_files; +} + +void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.min_blob_size = val; +} + +uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) { + return opt->rep.min_blob_size; +} + +void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.blob_file_size = val; +} + +uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) { + return opt->rep.blob_file_size; +} + +void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt, + int val) { + opt->rep.blob_compression_type = static_cast<CompressionType>(val); +} + +int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) { + return opt->rep.blob_compression_type; +} + +void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_garbage_collection = val; +} + +unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) { + return opt->rep.enable_blob_garbage_collection; +} + +void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_age_cutoff = val; +} + +double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_age_cutoff; +} +
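A minimal usage sketch of the blob-file setters wired up above, from a C-API consumer's point of view. This is illustrative only, not part of the patch: rocksdb_open(), rocksdb_close() and rocksdb_options_set_create_if_missing() come from the public rocksdb/c.h header and other hunks of this file, and the path and thresholds are invented.

#include <stdlib.h>
#include "rocksdb/c.h"

int main(void) {
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  /* Store values >= 4 KiB in separate blob files instead of the LSM tree. */
  rocksdb_options_set_enable_blob_files(opts, 1);
  rocksdb_options_set_min_blob_size(opts, 4096);
  /* Let compactions rewrite the oldest quarter of blob files to reclaim space. */
  rocksdb_options_set_enable_blob_gc(opts, 1);
  rocksdb_options_set_blob_gc_age_cutoff(opts, 0.25);
  char* err = NULL;
  rocksdb_t* db = rocksdb_open(opts, "/tmp/blob_example", &err);
  if (err == NULL) {
    rocksdb_close(db);
  } else {
    free(err); /* error strings are strdup'ed by SaveError() */
  }
  rocksdb_options_destroy(opts);
  return 0;
}
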
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_force_threshold = val; +} + +double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_force_threshold; +} + +void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val) { + opt->rep.blob_compaction_readahead_size = val; +} + +uint64_t rocksdb_options_get_blob_compaction_readahead_size( + rocksdb_options_t* opt) { + return opt->rep.blob_compaction_readahead_size; +} + +void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt, + int val) { + opt->rep.blob_file_starting_level = val; +} + +int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) { + return opt->rep.blob_file_starting_level; +} + +void rocksdb_options_set_blob_cache(rocksdb_options_t* opt, + rocksdb_cache_t* blob_cache) { + opt->rep.blob_cache = blob_cache->rep; +} + +void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) { + opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t); +} + +int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) { + return static_cast<int>(opt->rep.prepopulate_blob_cache); +} + +void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { + opt->rep.num_levels = n; +} + +int rocksdb_options_get_num_levels(rocksdb_options_t* opt) { + return opt->rep.num_levels; +} + +void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_file_num_compaction_trigger = n; +} + +int rocksdb_options_get_level0_file_num_compaction_trigger( + rocksdb_options_t* opt) { + return opt->rep.level0_file_num_compaction_trigger; +} + +void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt, + int n) { + opt->rep.level0_slowdown_writes_trigger = n; +} + +int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_slowdown_writes_trigger; +} + +void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt, + int n) { + opt->rep.level0_stop_writes_trigger = n; +} + +int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_stop_writes_trigger; +} + +void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) { + opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode); +} + +int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) { + return static_cast<int>(opt->rep.wal_recovery_mode); +} + +void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { + opt->rep.compression = static_cast<CompressionType>(t); +} + +int rocksdb_options_get_compression(rocksdb_options_t* opt) { + return opt->rep.compression; +} + +void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) { + opt->rep.bottommost_compression = static_cast<CompressionType>(t); +} + +int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) { + return opt->rep.bottommost_compression; +} + +void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, + const int* level_values, + size_t num_levels) { + opt->rep.compression_per_level.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.compression_per_level[i] = + static_cast<CompressionType>(level_values[i]); + } +} + +void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt, + int w_bits, int level, + int strategy, + int max_dict_bytes, + unsigned char enabled) { +
opt->rep.bottommost_compression_opts.window_bits = w_bits; + opt->rep.bottommost_compression_opts.level = level; + opt->rep.bottommost_compression_opts.strategy = strategy; + opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) { + opt->rep.bottommost_compression_opts.zstd_max_train_bytes = + zstd_max_train_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.use_zstd_dict_trainer = + use_zstd_dict_trainer; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +unsigned char +rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt) { + return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer; +} + +void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.max_dict_buffer_bytes = + max_dict_buffer_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, + int level, int strategy, + int max_dict_bytes) { + opt->rep.compression_opts.window_bits = w_bits; + opt->rep.compression_opts.level = level; + opt->rep.compression_opts.strategy = strategy; + opt->rep.compression_opts.max_dict_bytes = max_dict_bytes; +} + +void rocksdb_options_set_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes) { + opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes; +} + +int rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.zstd_max_train_bytes; +} + +void rocksdb_options_set_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) { + opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer; +} + +unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.use_zstd_dict_trainer; +} + +void rocksdb_options_set_compression_options_parallel_threads( + rocksdb_options_t* opt, int value) { + opt->rep.compression_opts.parallel_threads = value; +} + +int rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.parallel_threads; +} + +void rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) { + opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes; +} + +uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.max_dict_buffer_bytes; +} + +void rocksdb_options_set_prefix_extractor( + rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { + opt->rep.prefix_extractor.reset(prefix_extractor); +} + +void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) { + opt->rep.use_fsync = use_fsync; +} + +int rocksdb_options_get_use_fsync(rocksdb_options_t* 
opt) { + return opt->rep.use_fsync; +} + +void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt, + const char* db_log_dir) { + opt->rep.db_log_dir = db_log_dir; +} + +void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) { + opt->rep.wal_dir = v; +} + +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { + opt->rep.WAL_ttl_seconds = ttl; +} + +uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) { + return opt->rep.WAL_ttl_seconds; +} + +void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt, + uint64_t limit) { + opt->rep.WAL_size_limit_MB = limit; +} + +uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) { + return opt->rep.WAL_size_limit_MB; +} + +void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt, + size_t v) { + opt->rep.manifest_preallocation_size = v; +} + +size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) { + return opt->rep.manifest_preallocation_size; +} + +void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.use_direct_reads = v; +} + +unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) { + return opt->rep.use_direct_reads; +} + +void rocksdb_options_set_use_direct_io_for_flush_and_compaction( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.use_direct_io_for_flush_and_compaction = v; +} + +unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction( + rocksdb_options_t* opt) { + return opt->rep.use_direct_io_for_flush_and_compaction; +} + +void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.allow_mmap_reads = v; +} + +unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_reads; +} + +void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.allow_mmap_writes = v; +} + +unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_writes; +} + +void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.is_fd_close_on_exec = v; +} + +unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) { + return opt->rep.is_fd_close_on_exec; +} + +void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, + unsigned int v) { + opt->rep.stats_dump_period_sec = v; +} + +unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) { + return opt->rep.stats_dump_period_sec; +} + +void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt, + unsigned int v) { + opt->rep.stats_persist_period_sec = v; +} + +unsigned int rocksdb_options_get_stats_persist_period_sec( + rocksdb_options_t* opt) { + return opt->rep.stats_persist_period_sec; +} + +void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.advise_random_on_open = v; +} + +unsigned char rocksdb_options_get_advise_random_on_open( + rocksdb_options_t* opt) { + return opt->rep.advise_random_on_open; +} + +void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt, + int v) { + switch (v) { + case 0: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::NONE; + break; + case 1: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::NORMAL; + break; + case 2: + opt->rep.access_hint_on_compaction_start = + 
ROCKSDB_NAMESPACE::Options::SEQUENTIAL; + break; + case 3: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::WILLNEED; + break; + default: + assert(0); + } +} + +int rocksdb_options_get_access_hint_on_compaction_start( + rocksdb_options_t* opt) { + return opt->rep.access_hint_on_compaction_start; +} + +void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.use_adaptive_mutex = v; +} + +unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) { + return opt->rep.use_adaptive_mutex; +} + +void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt, + uint64_t v) { + opt->rep.wal_bytes_per_sync = v; +} + +uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.wal_bytes_per_sync; +} + +void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) { + opt->rep.bytes_per_sync = v; +} + +uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.bytes_per_sync; +} + +void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, + uint64_t v) { + opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v); +} + +uint64_t rocksdb_options_get_writable_file_max_buffer_size( + rocksdb_options_t* opt) { + return opt->rep.writable_file_max_buffer_size; +} + +void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.allow_concurrent_memtable_write = v; +} + +unsigned char rocksdb_options_get_allow_concurrent_memtable_write( + rocksdb_options_t* opt) { + return opt->rep.allow_concurrent_memtable_write; +} + +void rocksdb_options_set_enable_write_thread_adaptive_yield( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.enable_write_thread_adaptive_yield = v; +} + +unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield( + rocksdb_options_t* opt) { + return opt->rep.enable_write_thread_adaptive_yield; +} + +void rocksdb_options_set_max_sequential_skip_in_iterations( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.max_sequential_skip_in_iterations = v; +} + +uint64_t rocksdb_options_get_max_sequential_skip_in_iterations( + rocksdb_options_t* opt) { + return opt->rep.max_sequential_skip_in_iterations; +} + +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, + int n) { + opt->rep.max_write_buffer_number = n; +} + +int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number; +} + +void rocksdb_options_set_min_write_buffer_number_to_merge( + rocksdb_options_t* opt, int n) { + opt->rep.min_write_buffer_number_to_merge = n; +} + +int rocksdb_options_get_min_write_buffer_number_to_merge( + rocksdb_options_t* opt) { + return opt->rep.min_write_buffer_number_to_merge; +} + +void rocksdb_options_set_max_write_buffer_number_to_maintain( + rocksdb_options_t* opt, int n) { + opt->rep.max_write_buffer_number_to_maintain = n; +} + +int rocksdb_options_get_max_write_buffer_number_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number_to_maintain; +} + +void rocksdb_options_set_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt, int64_t n) { + opt->rep.max_write_buffer_size_to_maintain = n; +} + +int64_t rocksdb_options_get_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_size_to_maintain; +} + +void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, + unsigned char v) { +
opt->rep.enable_pipelined_write = v; +} + +unsigned char rocksdb_options_get_enable_pipelined_write( + rocksdb_options_t* opt) { + return opt->rep.enable_pipelined_write; +} + +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + +unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) { + return opt->rep.unordered_write; +} + +void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.max_subcompactions = n; +} + +uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) { + return opt->rep.max_subcompactions; +} + +void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { + opt->rep.max_background_jobs = n; +} + +int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) { + return opt->rep.max_background_jobs; +} + +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, + int n) { + opt->rep.max_background_compactions = n; +} + +int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) { + return opt->rep.max_background_compactions; +} + +void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { + opt->rep.max_background_flushes = n; +} + +int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) { + return opt->rep.max_background_flushes; +} + +void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt, + double v) { + opt->rep.experimental_mempurge_threshold = v; +} + +double rocksdb_options_get_experimental_mempurge_threshold( + rocksdb_options_t* opt) { + return opt->rep.experimental_mempurge_threshold; +} + +void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) { + opt->rep.max_log_file_size = v; +} + +size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) { + return opt->rep.max_log_file_size; +} + +void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, + size_t v) { + opt->rep.log_file_time_to_roll = v; +} + +size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) { + return opt->rep.log_file_time_to_roll; +} + +void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { + opt->rep.keep_log_file_num = v; +} + +size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) { + return opt->rep.keep_log_file_num; +} + +void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, + size_t v) { + opt->rep.recycle_log_file_num = v; +} + +size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) { + return opt->rep.recycle_log_file_num; +} + +void rocksdb_options_set_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v) { + opt->rep.soft_pending_compaction_bytes_limit = v; +} + +size_t rocksdb_options_get_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.soft_pending_compaction_bytes_limit; +} + +void rocksdb_options_set_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v) { + opt->rep.hard_pending_compaction_bytes_limit = v; +} + +size_t rocksdb_options_get_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.hard_pending_compaction_bytes_limit; +} + +void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt, + size_t v) { + opt->rep.max_manifest_file_size = v; +} + +size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) { + return opt->rep.max_manifest_file_size; +} + +void 
rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt, + int v) { + opt->rep.table_cache_numshardbits = v; +} + +int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) { + return opt->rep.table_cache_numshardbits; +} + +void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) { + opt->rep.arena_block_size = v; +} + +size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) { + return opt->rep.arena_block_size; +} + +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, + int disable) { + opt->rep.disable_auto_compactions = disable; +} + +unsigned char rocksdb_options_get_disable_auto_compactions( + rocksdb_options_t* opt) { + return opt->rep.disable_auto_compactions; +} + +void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, + int v) { + opt->rep.optimize_filters_for_hits = v; +} + +unsigned char rocksdb_options_get_optimize_filters_for_hits( + rocksdb_options_t* opt) { + return opt->rep.optimize_filters_for_hits; +} + +void rocksdb_options_set_delete_obsolete_files_period_micros( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.delete_obsolete_files_period_micros = v; +} + +uint64_t rocksdb_options_get_delete_obsolete_files_period_micros( + rocksdb_options_t* opt) { + return opt->rep.delete_obsolete_files_period_micros; +} + +void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { + opt->rep.PrepareForBulkLoad(); +} + +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) { + opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); +} + +void rocksdb_options_set_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt, double v) { + opt->rep.memtable_prefix_bloom_size_ratio = v; +} + +double rocksdb_options_get_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt) { + return opt->rep.memtable_prefix_bloom_size_ratio; +} + +void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt, + size_t v) { + opt->rep.memtable_huge_page_size = v; +} + +size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) { + return opt->rep.memtable_huge_page_size; +} + +void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt, + size_t bucket_count, + int32_t skiplist_height, + int32_t skiplist_branching_factor) { + ROCKSDB_NAMESPACE::MemTableRepFactory* factory = + ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( + bucket_count, skiplist_height, skiplist_branching_factor); + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt, + size_t bucket_count) { + opt->rep.memtable_factory.reset( + ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count)); +} + +void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt, + uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness) { + ROCKSDB_NAMESPACE::PlainTableOptions options; + options.user_key_len = user_key_len; + options.bloom_bits_per_key = bloom_bits_per_key; + options.hash_table_ratio = hash_table_ratio; + options.index_sparseness = index_sparseness; + + ROCKSDB_NAMESPACE::TableFactory* factory = + ROCKSDB_NAMESPACE::NewPlainTableFactory(options); + opt->rep.table_factory.reset(factory); +} + +void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt, + size_t v) { + opt->rep.max_successive_merges = v; +} + +size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) { + return opt->rep.max_successive_merges; +} + +void 
rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) { + opt->rep.bloom_locality = v; +} + +uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) { + return opt->rep.bloom_locality; +} + +void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.inplace_update_support = v; +} + +unsigned char rocksdb_options_get_inplace_update_support( + rocksdb_options_t* opt) { + return opt->rep.inplace_update_support; +} + +void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt, + size_t v) { + opt->rep.inplace_update_num_locks = v; +} + +size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) { + return opt->rep.inplace_update_num_locks; +} + +void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) { + opt->rep.report_bg_io_stats = v; +} + +unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) { + return opt->rep.report_bg_io_stats; +} + +void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) { + opt->rep.compaction_style = + static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style); +} + +int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) { + return opt->rep.compaction_style; +} + +void rocksdb_options_set_universal_compaction_options( + rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) { + opt->rep.compaction_options_universal = *(uco->rep); +} + +void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) { + opt->rep.compaction_options_fifo = fifo->rep; +} + +char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) { + ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get(); + if (statistics) { + return strdup(statistics->ToString().c_str()); + } + return nullptr; +} + +void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt, + rocksdb_ratelimiter_t* limiter) { + if (limiter) { + opt->rep.rate_limiter = limiter->rep; + } +} + +void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt, + unsigned char atomic_flush) { + opt->rep.atomic_flush = atomic_flush; +} + +unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) { + return opt->rep.atomic_flush; +} + +void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt, + unsigned char manual_wal_flush) { + opt->rep.manual_wal_flush = manual_wal_flush; +} + +unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) { + return opt->rep.manual_wal_flush; +} + +void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) { + opt->rep.wal_compression = static_cast<CompressionType>(val); +} + +int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) { + return opt->rep.wal_compression; +} + +rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec, + int64_t refill_period_us, + int32_t fairness) { + rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; + rate_limiter->rep.reset( + NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness)); + return rate_limiter; +} + +void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) { + delete limiter; +} + +void rocksdb_options_set_row_cache(rocksdb_options_t* opt, + rocksdb_cache_t* cache) { + if (cache) { + opt->rep.row_cache = cache->rep; + } +} + +void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory> + compact_on_del = +
NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + +void rocksdb_set_perf_level(int v) { + PerfLevel level = static_cast<PerfLevel>(v); + SetPerfLevel(level); +} + +rocksdb_perfcontext_t* rocksdb_perfcontext_create() { + rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t; + context->rep = ROCKSDB_NAMESPACE::get_perf_context(); + return context; +} + +void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) { + context->rep->Reset(); +} + +char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context, + unsigned char exclude_zero_counters) { + return strdup(context->rep->ToString(exclude_zero_counters).c_str()); +} + +uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, + int metric) { + PerfContext* rep = context->rep; + switch (metric) { + case rocksdb_user_key_comparison_count: + return rep->user_key_comparison_count; + case rocksdb_block_cache_hit_count: + return rep->block_cache_hit_count; + case rocksdb_block_read_count: + return rep->block_read_count; + case rocksdb_block_read_byte: + return rep->block_read_byte; + case rocksdb_block_read_time: + return rep->block_read_time; + case rocksdb_block_checksum_time: + return rep->block_checksum_time; + case rocksdb_block_decompress_time: + return rep->block_decompress_time; + case rocksdb_get_read_bytes: + return rep->get_read_bytes; + case rocksdb_multiget_read_bytes: + return rep->multiget_read_bytes; + case rocksdb_iter_read_bytes: + return rep->iter_read_bytes; + case rocksdb_internal_key_skipped_count: + return rep->internal_key_skipped_count; + case rocksdb_internal_delete_skipped_count: + return rep->internal_delete_skipped_count; + case rocksdb_internal_recent_skipped_count: + return rep->internal_recent_skipped_count; + case rocksdb_internal_merge_count: + return rep->internal_merge_count; + case rocksdb_get_snapshot_time: + return rep->get_snapshot_time; + case rocksdb_get_from_memtable_time: + return rep->get_from_memtable_time; + case rocksdb_get_from_memtable_count: + return rep->get_from_memtable_count; + case rocksdb_get_post_process_time: + return rep->get_post_process_time; + case rocksdb_get_from_output_files_time: + return rep->get_from_output_files_time; + case rocksdb_seek_on_memtable_time: + return rep->seek_on_memtable_time; + case rocksdb_seek_on_memtable_count: + return rep->seek_on_memtable_count; + case rocksdb_next_on_memtable_count: + return rep->next_on_memtable_count; + case rocksdb_prev_on_memtable_count: + return rep->prev_on_memtable_count; + case rocksdb_seek_child_seek_time: + return rep->seek_child_seek_time; + case rocksdb_seek_child_seek_count: + return rep->seek_child_seek_count; + case rocksdb_seek_min_heap_time: + return rep->seek_min_heap_time; + case rocksdb_seek_max_heap_time: + return rep->seek_max_heap_time; + case rocksdb_seek_internal_seek_time: + return rep->seek_internal_seek_time; + case rocksdb_find_next_user_entry_time: + return rep->find_next_user_entry_time; + case rocksdb_write_wal_time: + return rep->write_wal_time; + case rocksdb_write_memtable_time: + return rep->write_memtable_time; + case rocksdb_write_delay_time: + return rep->write_delay_time; + case rocksdb_write_pre_and_post_process_time: + return rep->write_pre_and_post_process_time; + case rocksdb_db_mutex_lock_nanos: + return rep->db_mutex_lock_nanos; + case rocksdb_db_condition_wait_nanos: + return rep->db_condition_wait_nanos; + case rocksdb_merge_operator_time_nanos: + return
rep->merge_operator_time_nanos; + case rocksdb_read_index_block_nanos: + return rep->read_index_block_nanos; + case rocksdb_read_filter_block_nanos: + return rep->read_filter_block_nanos; + case rocksdb_new_table_block_iter_nanos: + return rep->new_table_block_iter_nanos; + case rocksdb_new_table_iterator_nanos: + return rep->new_table_iterator_nanos; + case rocksdb_block_seek_nanos: + return rep->block_seek_nanos; + case rocksdb_find_table_nanos: + return rep->find_table_nanos; + case rocksdb_bloom_memtable_hit_count: + return rep->bloom_memtable_hit_count; + case rocksdb_bloom_memtable_miss_count: + return rep->bloom_memtable_miss_count; + case rocksdb_bloom_sst_hit_count: + return rep->bloom_sst_hit_count; + case rocksdb_bloom_sst_miss_count: + return rep->bloom_sst_miss_count; + case rocksdb_key_lock_wait_time: + return rep->key_lock_wait_time; + case rocksdb_key_lock_wait_count: + return rep->key_lock_wait_count; + case rocksdb_env_new_sequential_file_nanos: + return rep->env_new_sequential_file_nanos; + case rocksdb_env_new_random_access_file_nanos: + return rep->env_new_random_access_file_nanos; + case rocksdb_env_new_writable_file_nanos: + return rep->env_new_writable_file_nanos; + case rocksdb_env_reuse_writable_file_nanos: + return rep->env_reuse_writable_file_nanos; + case rocksdb_env_new_random_rw_file_nanos: + return rep->env_new_random_rw_file_nanos; + case rocksdb_env_new_directory_nanos: + return rep->env_new_directory_nanos; + case rocksdb_env_file_exists_nanos: + return rep->env_file_exists_nanos; + case rocksdb_env_get_children_nanos: + return rep->env_get_children_nanos; + case rocksdb_env_get_children_file_attributes_nanos: + return rep->env_get_children_file_attributes_nanos; + case rocksdb_env_delete_file_nanos: + return rep->env_delete_file_nanos; + case rocksdb_env_create_dir_nanos: + return rep->env_create_dir_nanos; + case rocksdb_env_create_dir_if_missing_nanos: + return rep->env_create_dir_if_missing_nanos; + case rocksdb_env_delete_dir_nanos: + return rep->env_delete_dir_nanos; + case rocksdb_env_get_file_size_nanos: + return rep->env_get_file_size_nanos; + case rocksdb_env_get_file_modification_time_nanos: + return rep->env_get_file_modification_time_nanos; + case rocksdb_env_rename_file_nanos: + return rep->env_rename_file_nanos; + case rocksdb_env_link_file_nanos: + return rep->env_link_file_nanos; + case rocksdb_env_lock_file_nanos: + return rep->env_lock_file_nanos; + case rocksdb_env_unlock_file_nanos: + return rep->env_unlock_file_nanos; + case rocksdb_env_new_logger_nanos: + return rep->env_new_logger_nanos; + case rocksdb_number_async_seek: + return rep->number_async_seek; + case rocksdb_blob_cache_hit_count: + return rep->blob_cache_hit_count; + case rocksdb_blob_read_count: + return rep->blob_read_count; + case rocksdb_blob_read_byte: + return rep->blob_read_byte; + case rocksdb_blob_read_time: + return rep->blob_read_time; + case rocksdb_blob_checksum_time: + return rep->blob_checksum_time; + case rocksdb_blob_decompress_time: + return rep->blob_decompress_time; + case rocksdb_internal_range_del_reseek_count: + return rep->internal_range_del_reseek_count; + default: + break; + } + return 0; +} + +void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) { + delete context; +} + +/* +TODO: +DB::OpenForReadOnly +DB::KeyMayExist +DB::GetOptions +DB::GetSortedWalFiles +DB::GetLatestSequenceNumber +DB::GetUpdatesSince +DB::GetDbIdentity +DB::RunManualCompaction +custom cache +table_properties_collectors +*/ + +rocksdb_compactionfilter_t* 
rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)) { + rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t; + result->state_ = state; + result->destructor_ = destructor; + result->filter_ = filter; + result->ignore_snapshots_ = true; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) { + filter->ignore_snapshots_ = whether_ignore; +} + +void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) { + delete filter; +} + +unsigned char rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_full_compaction; +} + +unsigned char rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_manual_compaction; +} + +rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)) { + rocksdb_compactionfilterfactory_t* result = + new rocksdb_compactionfilterfactory_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_compaction_filter_ = create_compaction_filter; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t* factory) { + delete factory; +} + +rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)) { + rocksdb_comparator_t* result = new rocksdb_comparator_t; + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->name_ = name; + result->compare_ts_ = nullptr; + result->compare_without_ts_ = nullptr; + return result; +} + +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; } + +rocksdb_comparator_t* rocksdb_comparator_with_ts_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen), + int (*compare_without_ts)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, + size_t blen, unsigned char b_has_ts), + const char* (*name)(void*), size_t timestamp_size) { + rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size); + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->compare_ts_ = compare_ts; + result->compare_without_ts_ = compare_without_ts; + result->name_ = name; + return result; +} + +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { + delete filter; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format( + double bits_per_key, bool original_format) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. 
+ struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } + // No need to override GetFilterBitsBuilder if this one is overridden + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full( + double bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewRibbonFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = + NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon( + double bloom_equivalent_bits_per_key) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, bloom_before_level); +} + +rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t 
value_length), + const char* (*name)(void*)) { + rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t; + result->state_ = state; + result->destructor_ = destructor; + result->full_merge_ = full_merge; + result->partial_merge_ = partial_merge; + result->delete_value_ = delete_value; + result->name_ = name; + return result; +} + +void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) { + delete merge_operator; +} + +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; +} + +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; } + +void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +unsigned char rocksdb_readoptions_get_verify_checksums( + rocksdb_readoptions_t* opt) { + return opt->rep.verify_checksums; +} + +void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.fill_cache = v; +} + +unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) { + return opt->rep.fill_cache; +} + +void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? snap->rep : nullptr); +} + +void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt, + const char* key, + size_t keylen) { + if (key == nullptr) { + opt->upper_bound = Slice(); + opt->rep.iterate_upper_bound = nullptr; + + } else { + opt->upper_bound = Slice(key, keylen); + opt->rep.iterate_upper_bound = &opt->upper_bound; + } +} + +void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt, + const char* key, + size_t keylen) { + if (key == nullptr) { + opt->lower_bound = Slice(); + opt->rep.iterate_lower_bound = nullptr; + } else { + opt->lower_bound = Slice(key, keylen); + opt->rep.iterate_lower_bound = &opt->lower_bound; + } +} + +void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) { + opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v); +} + +int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) { + return static_cast<int>(opt->rep.read_tier); +} + +void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.tailing = v; +} + +unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) { + return opt->rep.tailing; +} + +void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.managed = v; +} + +void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt, + size_t v) { + opt->rep.readahead_size = v; +} + +size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) { + return opt->rep.readahead_size; +} + +void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.prefix_same_as_start = v; +} + +unsigned char rocksdb_readoptions_get_prefix_same_as_start( + rocksdb_readoptions_t* opt) { + return opt->rep.prefix_same_as_start; +} + +void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.pin_data = v; +} + +unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) { + return opt->rep.pin_data; +} + +void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.total_order_seek = v; +} + +unsigned char rocksdb_readoptions_get_total_order_seek( + rocksdb_readoptions_t* opt) { + return opt->rep.total_order_seek; +} + +void
rocksdb_readoptions_set_max_skippable_internal_keys( + rocksdb_readoptions_t* opt, uint64_t v) { + opt->rep.max_skippable_internal_keys = v; +} + +uint64_t rocksdb_readoptions_get_max_skippable_internal_keys( + rocksdb_readoptions_t* opt) { + return opt->rep.max_skippable_internal_keys; +} + +void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.background_purge_on_iterator_cleanup = v; +} + +unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt) { + return opt->rep.background_purge_on_iterator_cleanup; +} + +void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.ignore_range_deletions = v; +} + +unsigned char rocksdb_readoptions_get_ignore_range_deletions( + rocksdb_readoptions_t* opt) { + return opt->rep.ignore_range_deletions; +} + +void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.deadline = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) { + return opt->rep.deadline.count(); +} + +void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.io_timeout = std::chrono::microseconds(microseconds); +} + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { + return opt->rep.io_timeout.count(); +} + +void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt, + const char* ts, size_t tslen) { + if (ts == nullptr) { + opt->timestamp = Slice(); + opt->rep.timestamp = nullptr; + } else { + opt->timestamp = Slice(ts, tslen); + opt->rep.timestamp = &opt->timestamp; + } +} + +void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt, + const char* ts, size_t tslen) { + if (ts == nullptr) { + opt->iter_start_ts = Slice(); + opt->rep.iter_start_ts = nullptr; + } else { + opt->iter_start_ts = Slice(ts, tslen); + opt->rep.iter_start_ts = &opt->iter_start_ts; + } +} + +rocksdb_writeoptions_t* rocksdb_writeoptions_create() { + return new rocksdb_writeoptions_t; +} + +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; } + +void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.sync = v; +} + +unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) { + return opt->rep.sync; +} + +void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, + int disable) { + opt->rep.disableWAL = disable; +} + +unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt) { + return opt->rep.disableWAL; +} + +void rocksdb_writeoptions_set_ignore_missing_column_families( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.ignore_missing_column_families = v; +} + +unsigned char rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t* opt) { + return opt->rep.ignore_missing_column_families; +} + +void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.no_slowdown = v; +} + +unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t* opt) { + return opt->rep.no_slowdown; +} + +void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.low_pri = v; +} + +unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) { + return opt->rep.low_pri; +} + +void 
rocksdb_writeoptions_set_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.memtable_insert_hint_per_batch = v; +} + +unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt) { + return opt->rep.memtable_insert_hint_per_batch; +} + +rocksdb_compactoptions_t* rocksdb_compactoptions_create() { + return new rocksdb_compactoptions_t; +} + +void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) { + delete opt; +} + +void rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t* opt, unsigned char v) { + opt->rep.bottommost_level_compaction = + static_cast<BottommostLevelCompaction>(v); +} + +unsigned char rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t* opt) { + return static_cast<unsigned char>(opt->rep.bottommost_level_compaction); +} + +void rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt, unsigned char v) { + opt->rep.exclusive_manual_compaction = v; +} + +unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt) { + return opt->rep.exclusive_manual_compaction; +} + +void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt, + unsigned char v) { + opt->rep.change_level = v; +} + +unsigned char rocksdb_compactoptions_get_change_level( + rocksdb_compactoptions_t* opt) { + return opt->rep.change_level; +} + +void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt, + int n) { + opt->rep.target_level = n; +} + +int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) { + return opt->rep.target_level; +} + +void rocksdb_compactoptions_set_full_history_ts_low( + rocksdb_compactoptions_t* opt, char* ts, size_t tslen) { + if (ts == nullptr) { + opt->full_history_ts_low = Slice(); + opt->rep.full_history_ts_low = nullptr; + } else { + opt->full_history_ts_low = Slice(ts, tslen); + opt->rep.full_history_ts_low = &opt->full_history_ts_low; + } +} + +rocksdb_flushoptions_t* rocksdb_flushoptions_create() { + return new rocksdb_flushoptions_t; +} + +void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; } + +void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt, + unsigned char v) { + opt->rep.wait = v; +} + +unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { + return opt->rep.wait; +} + +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_num_shard_bits( + rocksdb_lru_cache_options_t* opt, int num_shard_bits) { + opt->rep.num_shard_bits = num_shard_bits; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} +
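The LRU cache options holder above only carries settings; it becomes a real cache through rocksdb_cache_create_lru_opts(), defined just below. A minimal, illustrative C sketch (not part of the patch), assuming only the public rocksdb/c.h header; the capacity and shard count are arbitrary:

  /* Build a 64 MiB LRU cache with 2^4 shards from an options holder. */
  rocksdb_lru_cache_options_t* copts = rocksdb_lru_cache_options_create();
  rocksdb_lru_cache_options_set_capacity(copts, 64 * 1024 * 1024);
  rocksdb_lru_cache_options_set_num_shard_bits(copts, 4);
  rocksdb_cache_t* cache = rocksdb_cache_create_lru_opts(copts);
  rocksdb_lru_cache_options_destroy(copts); /* settings are copied into the cache */
  /* ... hand the cache to block-based table options, then ... */
  rocksdb_cache_destroy(cache);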
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit( + size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + c->rep->SetStrictCapacityLimit(true); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } + +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + +void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { + cache->rep->SetCapacity(capacity); +} + +size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) { + return cache->rep->GetCapacity(); +} + +size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) { + return cache->rep->GetUsage(); +} + +size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) { + return cache->rep->GetPinnedUsage(); +} + +rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, + uint64_t target_size) { + rocksdb_dbpath_t* result = new rocksdb_dbpath_t; + result->rep.path = std::string(path); + result->rep.target_size = target_size; + return result; +} + +void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; } + +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = Env::Default(); + result->is_default = true; + return result; +} + +rocksdb_env_t* rocksdb_create_mem_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default()); + result->is_default = false; + return result; +} + +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + +void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::BOTTOM); +} + +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + +void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::LOW); +} + +int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + +void rocksdb_env_join_all_threads(rocksdb_env_t* env) { + env->rep->WaitForJoin(); +} + +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority( + rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority( + rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + +void rocksdb_env_destroy(rocksdb_env_t* env) { + if 
(!env->is_default) delete env->rep; + delete env; +} + +rocksdb_envoptions_t* rocksdb_envoptions_create() { + rocksdb_envoptions_t* opt = new rocksdb_envoptions_t; + return opt; +} + +void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; } + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path, + char** errptr) { + SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path))); +} + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* /*comparator*/) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer, + const char* name, char** errptr) { + SaveError(errptr, writer->rep->Open(std::string(name))); +} + +void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* ts, size_t tslen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen), + Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, writer->rep->Delete(Slice(key, keylen))); +} + +void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* ts, size_t tslen, + char** errptr) { + SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen))); +} + +void rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer, + const char* begin_key, + size_t begin_keylen, + const char* end_key, size_t end_keylen, + char** errptr) { + SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen), + Slice(end_key, end_keylen))); +} + +void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer, + char** errptr) { + SaveError(errptr, writer->rep->Finish(nullptr)); +} + +void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer, + uint64_t* file_size) { + *file_size = writer->rep->FileSize(); +} + +void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) { + delete writer->rep; + delete writer; +} + +rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create() { + rocksdb_ingestexternalfileoptions_t* opt = + new rocksdb_ingestexternalfileoptions_t; + return opt; +} + +void 
rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) { + opt->rep.move_files = move_files; +} + +void rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency) { + opt->rep.snapshot_consistency = snapshot_consistency; +} + +void rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_global_seqno) { + opt->rep.allow_global_seqno = allow_global_seqno; +} + +void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush) { + opt->rep.allow_blocking_flush = allow_blocking_flush; +} + +void rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) { + opt->rep.ingest_behind = ingest_behind; +} + +void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt) { + delete opt; +} + +void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) { + std::vector<std::string> files(list_len); + for (size_t i = 0; i < list_len; ++i) { + files[i] = std::string(file_list[i]); + } + SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep)); +} + +void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) { + std::vector<std::string> files(list_len); + for (size_t i = 0; i < list_len; ++i) { + files[i] = std::string(file_list[i]); + } + SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); +} + +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} +
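A hedged end-to-end sketch of the ingestion path wrapped above: build a small SST with the SstFileWriter entry points from earlier in this file, then ingest it. The db and opts variables and the /tmp path are illustrative only, and the per-call checks of err are omitted for brevity:

  char* err = NULL;
  rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
  rocksdb_sstfilewriter_t* w = rocksdb_sstfilewriter_create(env_opts, opts);
  rocksdb_sstfilewriter_open(w, "/tmp/batch.sst", &err);
  rocksdb_sstfilewriter_put(w, "k1", 2, "v1", 2, &err); /* keys must be added in sorted order */
  rocksdb_sstfilewriter_put(w, "k2", 2, "v2", 2, &err);
  rocksdb_sstfilewriter_finish(w, &err);
  rocksdb_sstfilewriter_destroy(w);
  const char* files[1] = {"/tmp/batch.sst"};
  rocksdb_ingestexternalfileoptions_t* iopts = rocksdb_ingestexternalfileoptions_create();
  rocksdb_ingestexternalfileoptions_set_move_files(iopts, 1); /* move instead of copy */
  rocksdb_ingest_external_file(db, files, 1, iopts, &err);
  rocksdb_ingestexternalfileoptions_destroy(iopts);
  rocksdb_envoptions_destroy(env_opts);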
+rocksdb_slicetransform_t* rocksdb_slicetransform_create( + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), + const char* (*name)(void*)) { + rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t; + result->state_ = state; + result->destructor_ = destructor; + result->transform_ = transform; + result->in_domain_ = in_domain; + result->in_range_ = in_range; + result->name_ = name; + return result; +} + +void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; } + +struct SliceTransformWrapper : public rocksdb_slicetransform_t { + const SliceTransform* rep_; + ~SliceTransformWrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + std::string GetId() const override { return rep_->GetId(); } + Slice Transform(const Slice& src) const override { + return rep_->Transform(src); + } + bool InDomain(const Slice& src) const override { return rep_->InDomain(src); } + bool InRange(const Slice& src) const override { return rep_->InRange(src); } + static void DoNothing(void*) {} +}; + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix( + size_t prefixLen) { + SliceTransformWrapper* wrapper = new SliceTransformWrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); + wrapper->state_ = nullptr; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; + return wrapper; +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() { + SliceTransformWrapper* wrapper = new SliceTransformWrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform(); + wrapper->state_ = nullptr; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; + return wrapper; +} + +rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = + new rocksdb_universal_compaction_options_t; + result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + +void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} + +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + +void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = + static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style); +} + +int rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t* uco) { + return static_cast<int>(uco->rep->stop_style); +} + +void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t* uco) { + delete uco->rep; + delete uco; +} + +rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() { + rocksdb_fifo_compaction_options_t* result = + new rocksdb_fifo_compaction_options_t; + result->rep = CompactionOptionsFIFO(); + return result; +} + +void rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) { + fifo_opts->rep.max_table_files_size = size; +} + +uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_table_files_size; +} + +void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts) { + delete fifo_opts; +} +
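The universal compaction knobs above are plain setters on a heap-allocated CompactionOptionsUniversal. A sketch of wiring them into a database options object follows; rocksdb_options_set_universal_compaction_options() and the rocksdb_universal_compaction constant live elsewhere in this file and its header, so treat those exact names as assumptions:

  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_compaction_style(opts, rocksdb_universal_compaction);
  rocksdb_universal_compaction_options_t* uco = rocksdb_universal_compaction_options_create();
  rocksdb_universal_compaction_options_set_size_ratio(uco, 1);
  rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, 200);
  rocksdb_options_set_universal_compaction_options(opts, uco);
  rocksdb_universal_compaction_options_destroy(uco); /* values were copied into opts */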
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, + int level) { + if (level >= 0) { + assert(level <= opt->rep.num_levels); + opt->rep.compression_per_level.resize(opt->rep.num_levels); + for (int i = 0; i < level; i++) { + opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression; + } + for (int i = level; i < opt->rep.num_levels; i++) { + opt->rep.compression_per_level[i] = opt->rep.compression; + } + } +} + +int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) { + return static_cast<int>(lf->rep.size()); +} + +const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].column_family_name.c_str(); +} + +const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) { + return lf->rep[index].name.c_str(); +} + +int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) { + return lf->rep[index].level; +} + +size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) { + return lf->rep[index].size; +} + +const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf, + int index, size_t* size) { + *size = lf->rep[index].smallestkey.size(); + return lf->rep[index].smallestkey.data(); +} + +const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf, + int index, size_t* size) { + *size = lf->rep[index].largestkey.size(); + return lf->rep[index].largestkey.data(); +} + +uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) { + return lf->rep[index].num_entries; +} + +uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) { + return lf->rep[index].num_deletions; +} + +extern void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { + delete lf; +} + +void rocksdb_get_options_from_string(const rocksdb_options_t* base_options, + const char* opts_str, + rocksdb_options_t* new_options, + char** errptr) { + SaveError(errptr, + GetOptionsFromString(base_options->rep, std::string(opts_str), + &new_options->rep)); +} + +void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, db->rep->DefaultColumnFamily(), + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ?
(b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +/* MetaData */ + +rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata( + rocksdb_t* db) { + rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t; + db->rep->GetColumnFamilyMetaData(&meta->rep); + return meta; +} + +rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family) { + rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t; + db->rep->GetColumnFamilyMetaData(column_family->rep, &meta->rep); + return meta; +} + +void rocksdb_column_family_metadata_destroy( + rocksdb_column_family_metadata_t* cf_meta) { + delete cf_meta; +} + +uint64_t rocksdb_column_family_metadata_get_size( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.size; +} + +size_t rocksdb_column_family_metadata_get_file_count( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.file_count; +} + +char* rocksdb_column_family_metadata_get_name( + rocksdb_column_family_metadata_t* cf_meta) { + return strdup(cf_meta->rep.name.c_str()); +} + +size_t rocksdb_column_family_metadata_get_level_count( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.levels.size(); +} + +rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata( + rocksdb_column_family_metadata_t* cf_meta, size_t i) { + if (i >= cf_meta->rep.levels.size()) { + return NULL; + } + rocksdb_level_metadata_t* level_meta = + (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t)); + level_meta->rep = &cf_meta->rep.levels[i]; + + return level_meta; +} + +void rocksdb_level_metadata_destroy(rocksdb_level_metadata_t* level_meta) { + // Only free the base pointer as its parent rocksdb_column_family_metadata_t + // has the ownership of its rep. + free(level_meta); +} + +int rocksdb_level_metadata_get_level(rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->level; +} + +uint64_t rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->size; +} + +size_t rocksdb_level_metadata_get_file_count( + rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->files.size(); +} + +rocksdb_sst_file_metadata_t* rocksdb_level_metadata_get_sst_file_metadata( + rocksdb_level_metadata_t* level_meta, size_t i) { + if (i >= level_meta->rep->files.size()) { + return nullptr; + } + rocksdb_sst_file_metadata_t* file_meta = + (rocksdb_sst_file_metadata_t*)malloc(sizeof(rocksdb_sst_file_metadata_t)); + file_meta->rep = &level_meta->rep->files[i]; + return file_meta; +} + +void rocksdb_sst_file_metadata_destroy(rocksdb_sst_file_metadata_t* file_meta) { + // Only free the base pointer as its parent rocksdb_level_metadata_t + // has the ownership of its rep. 
+ free(file_meta); +} + +char* rocksdb_sst_file_metadata_get_relative_filename( + rocksdb_sst_file_metadata_t* file_meta) { + return strdup(file_meta->rep->relative_filename.c_str()); +} + +uint64_t rocksdb_sst_file_metadata_get_size( + rocksdb_sst_file_metadata_t* file_meta) { + return file_meta->rep->size; +} + +char* rocksdb_sst_file_metadata_get_smallestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) { + *key_len = file_meta->rep->smallestkey.size(); + return CopyString(file_meta->rep->smallestkey); +} + +char* rocksdb_sst_file_metadata_get_largestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) { + *key_len = file_meta->rep->largestkey.size(); + return CopyString(file_meta->rep->largestkey); +} + +/* Transactions */ + +rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() { + return new rocksdb_transactiondb_options_t; +} + +void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt) { + delete opt; +} + +void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) { + opt->rep.max_num_locks = max_num_locks; +} + +void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes) { + opt->rep.num_stripes = num_stripes; +} + +void rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) { + opt->rep.transaction_lock_timeout = txn_lock_timeout; +} + +void rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) { + opt->rep.default_lock_timeout = default_lock_timeout; +} + +rocksdb_transaction_options_t* rocksdb_transaction_options_create() { + return new rocksdb_transaction_options_t; +} + +void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) { + delete opt; +} + +void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.deadlock_detect = v; +} + +void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout) { + opt->rep.lock_timeout = lock_timeout; +} + +void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration) { + opt->rep.expiration = expiration; +} + +void rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth) { + opt->rep.deadlock_detect_depth = depth; +} + +void rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size) { + opt->rep.max_write_batch_size = size; +} + +void rocksdb_transaction_options_set_skip_prepare( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.skip_prepare = v; +} + +rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create() { + return new rocksdb_optimistictransaction_options_t; +} + +void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt) { + delete opt; +} + +void rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname) { + 
std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + +rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr) { + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, txn_db->rep->CreateColumnFamily( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep))); + return handle; +} + +rocksdb_transactiondb_t* rocksdb_transactiondb_open( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + char** errptr) { + TransactionDB* txn_db; + if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep, + std::string(name), &txn_db))) { + return nullptr; + } + rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t; + result->rep = txn_db; + return result; +} + +rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector<ColumnFamilyDescriptor> column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + TransactionDB* txn_db; + std::vector<ColumnFamilyHandle*> handles; + if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep, + std::string(name), column_families, + &handles, &txn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t; + result->rep = txn_db; + return result; +} + +const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot( + rocksdb_transactiondb_t* txn_db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = txn_db->rep->GetSnapshot(); + return result; +} + +void rocksdb_transactiondb_release_snapshot( + rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) { + txn_db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} +
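Taken together, the TransactionDB plumbing above supports the usual open/begin/commit cycle from C. A hedged sketch (the path is illustrative, per-call err checks are omitted, and rocksdb_transaction_begin and rocksdb_transaction_put are defined further below in this file):

  char* err = NULL;
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  rocksdb_transactiondb_options_t* tdb_opts = rocksdb_transactiondb_options_create();
  rocksdb_transactiondb_t* tdb = rocksdb_transactiondb_open(opts, tdb_opts, "/tmp/txndb", &err);
  rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
  rocksdb_transaction_options_t* topts = rocksdb_transaction_options_create();
  rocksdb_transaction_t* txn = rocksdb_transaction_begin(tdb, wopts, topts, NULL);
  rocksdb_transaction_put(txn, "k", 1, "v", 1, &err); /* staged, not yet visible */
  rocksdb_transaction_commit(txn, &err);              /* atomically applied */
  rocksdb_transaction_destroy(txn);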
+char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db, + const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + +rocksdb_transaction_t* rocksdb_transaction_begin( + rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_transaction_options_t* txn_options, + rocksdb_transaction_t* old_txn) { + if (old_txn == nullptr) { + rocksdb_transaction_t* result = new rocksdb_transaction_t; + result->rep = txn_db->rep->BeginTransaction(write_options->rep, + txn_options->rep, nullptr); + return result; + } + old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep, + txn_options->rep, old_txn->rep); + return old_txn; +} + +rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions( + rocksdb_transactiondb_t* txn_db, size_t* cnt) { + std::vector<Transaction*> txns; + txn_db->rep->GetAllPreparedTransactions(&txns); + *cnt = txns.size(); + if (txns.empty()) { + return nullptr; + } else { + rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc( + txns.size() * sizeof(rocksdb_transaction_t*)); + for (size_t i = 0; i < txns.size(); i++) { + buf[i] = new rocksdb_transaction_t; + buf[i]->rep = txns[i]; + } + return buf; + } +} + +void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name, + size_t name_len, char** errptr) { + std::string str = std::string(name, name_len); + SaveError(errptr, txn->rep->SetName(str)); +} + +char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn, + size_t* name_len) { + auto name = txn->rep->GetName(); + *name_len = name.size(); + return CopyString(name); +} + +void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->Prepare()); +} + +rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi( + rocksdb_transaction_t* txn) { + rocksdb_writebatch_wi_t* wi = + (rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t)); + wi->rep = txn->rep->GetWriteBatch(); + + return wi; +} + +void rocksdb_transaction_rebuild_from_writebatch( + rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch, + char** errptr) { + SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep)); +} + +void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn, + rocksdb_writebatch_wi_t* wi, + char** errptr) { + SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch())); +} + +void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->Commit()); +} + +void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->Rollback()); +} + +void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) { + txn->rep->SetSavePoint(); +} + +void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, + char** errptr) { + SaveError(errptr, txn->rep->RollbackToSavePoint()); +} + +void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { + delete txn->rep; + delete txn; +} + +const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( + rocksdb_transaction_t* txn) { + // This will be freed later on using free, so use malloc here to avoid a + // mismatch + rocksdb_snapshot_t* result = + (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t)); + result->rep = txn->rep->GetSnapshot(); + return result; +} + +// Read a key inside a transaction +char*
rocksdb_transaction_get(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), + &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +// Read a key inside a transaction +char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, unsigned char exclusive, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen), + v->rep.GetSelf(), exclusive); + v->rep.PinSelf(); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* 
rocksdb_transaction_get_pinned_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + unsigned char exclusive, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &v->rep, exclusive); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + size_t num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, + char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector<Slice> keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector<std::string> values(num_keys); + std::vector<Status> statuses = + txn->rep->MultiGet(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transaction_multi_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector<Slice> keys(num_keys); + std::vector<ColumnFamilyHandle*> cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector<std::string> values(num_keys); + std::vector<Status> statuses = + txn->rep->MultiGet(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} +
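The multi-get wrappers above map vector results onto caller-provided arrays, with the convention that a NULL value together with a NULL error slot means NotFound. A hedged sketch of the calling convention (txn as obtained from rocksdb_transaction_begin; every output is released with rocksdb_free, defined later in this file):

  const char* keys[2] = {"a", "b"};
  const size_t key_sizes[2] = {1, 1};
  char* vals[2]; size_t val_sizes[2]; char* errs[2];
  rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
  rocksdb_transaction_multi_get(txn, ropts, 2, keys, key_sizes, vals, val_sizes, errs);
  for (int i = 0; i < 2; i++) {
    if (vals[i] != NULL) rocksdb_free(vals[i]); /* found: a malloc'd copy */
    if (errs[i] != NULL) rocksdb_free(errs[i]); /* a real error, not NotFound */
  }
  rocksdb_readoptions_destroy(ropts);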
+// Read a key outside a transaction +char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(), + Slice(key, klen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + size_t num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, + char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector<Slice> keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector<std::string> values(num_keys); + std::vector<Status> statuses = + txn_db->rep->MultiGet(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transactiondb_multi_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector<Slice> keys(num_keys); + std::vector<ColumnFamilyHandle*> cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector<std::string> values(num_keys); + std::vector<Status> statuses = + txn_db->rep->MultiGet(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +// Put a key inside a transaction +void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn, + uint64_t commit_timestamp) { + txn->rep->SetCommitTimestamp(commit_timestamp); +} + +void rocksdb_transaction_set_read_timestamp_for_validation( + rocksdb_transaction_t*
txn, uint64_t read_timestamp) { + txn->rep->SetReadTimestampForValidation(read_timestamp); +} + +// Put a key outside a transaction +void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, + txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +// Write batch into transaction db +void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +// Merge a key inside a transaction +void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +// Merge a key outside a transaction +void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep, + Slice(key, klen), Slice(val, vlen))); +} + +// Delete a key inside a transaction +void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key, + size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(Slice(key, klen))); +} + +void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen))); +} + +// Delete a key outside a transaction +void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen))); +} + +void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep, + Slice(key, keylen))); +} + +// Create an iterator inside a transaction +rocksdb_iterator_t* rocksdb_transaction_create_iterator( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + 
result->rep = txn->rep->GetIterator(options->rep); + return result; +} + +// Create an iterator inside a transaction with column family +rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn->rep->GetIterator(options->rep, column_family->rep); + return result; +} + +// Create an iterator outside a transaction +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep); + return result; +} + +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep); + return result; +} + +void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) { + delete txn_db->rep; + delete txn_db; +} + +void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db, + unsigned char sync, char** errptr) { + SaveError(errptr, txn_db->rep->FlushWAL(sync)); +} + +void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db, + const rocksdb_flushoptions_t* options, + char** errptr) { + SaveError(errptr, txn_db->rep->Flush(options->rep)); +} + +void rocksdb_transactiondb_flush_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr) { + SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep)); +} + +rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create( + rocksdb_transactiondb_t* txn_db, char** errptr) { + Checkpoint* checkpoint; + if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) { + return nullptr; + } + rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t; + result->rep = checkpoint; + return result; +} + +rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open( + const rocksdb_options_t* options, const char* name, char** errptr) { + OptimisticTransactionDB* otxn_db; + if (SaveError(errptr, OptimisticTransactionDB::Open( + options->rep, std::string(name), &otxn_db))) { + return nullptr; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +} + +rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector<ColumnFamilyDescriptor> column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + OptimisticTransactionDB* otxn_db; + std::vector<ColumnFamilyHandle*> handles; + if (SaveError(errptr, OptimisticTransactionDB::Open( + DBOptions(db_options->rep), std::string(name), + column_families, &handles, &otxn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +}
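The optimistic variant mirrors the pessimistic API but defers conflict detection to commit time. A hedged sketch (opts, wopts, and err as in the earlier sketches; rocksdb_optimistictransaction_begin is defined just below, and the path is illustrative):

  rocksdb_optimistictransactiondb_t* odb =
      rocksdb_optimistictransactiondb_open(opts, "/tmp/otxndb", &err);
  rocksdb_optimistictransaction_options_t* oopts =
      rocksdb_optimistictransaction_options_create();
  rocksdb_transaction_t* txn = rocksdb_optimistictransaction_begin(odb, wopts, oopts, NULL);
  rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
  rocksdb_transaction_commit(txn, &err); /* a write conflict would surface here */
  rocksdb_transaction_destroy(txn);
  rocksdb_optimistictransaction_options_destroy(oopts);
  rocksdb_optimistictransactiondb_close(odb);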
rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +} + +rocksdb_t* rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db) { + DB* base_db = otxn_db->rep->GetBaseDB(); + + if (base_db != nullptr) { + rocksdb_t* result = new rocksdb_t; + result->rep = base_db; + return result; + } + + return nullptr; +} + +void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) { + delete base_db; +} + +rocksdb_transaction_t* rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn) { + if (old_txn == nullptr) { + rocksdb_transaction_t* result = new rocksdb_transaction_t; + result->rep = otxn_db->rep->BeginTransaction(write_options->rep, + otxn_options->rep, nullptr); + return result; + } + old_txn->rep = otxn_db->rep->BeginTransaction( + write_options->rep, otxn_options->rep, old_txn->rep); + return old_txn; +} + +// Write batch into OptimisticTransactionDB +void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep)); +} + +void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db) { + delete otxn_db->rep; + delete otxn_db; +} + +rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) { + Checkpoint* checkpoint; + if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) { + return nullptr; + } + rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t; + result->rep = checkpoint; + return result; +} + +void rocksdb_free(void* ptr) { free(ptr); } + +rocksdb_pinnableslice_t* rocksdb_get_pinned( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(), + Slice(key, keylen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), + &v->rep); + if (!s.ok()) { + delete v; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; } + +const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v, + size_t* vlen) { + if (!v) { + *vlen = 0; + return nullptr; + } + + *vlen = v->rep.size(); + return v->rep.data(); +} + +// container to keep databases and caches in order to use +// ROCKSDB_NAMESPACE::MemoryUtil +struct rocksdb_memory_consumers_t { + std::vector<rocksdb_t*> dbs; + std::unordered_set<rocksdb_cache_t*> caches; +}; + +// initializes new container of memory consumers +rocksdb_memory_consumers_t* 
rocksdb_memory_consumers_create() { + return new rocksdb_memory_consumers_t; +} + +// adds a database to the container of memory consumers +void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers, + rocksdb_t* db) { + consumers->dbs.push_back(db); +} + +// adds cache to the container of memory consumers +void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers, + rocksdb_cache_t* cache) { + consumers->caches.insert(cache); +} + +// deletes container with memory consumers +void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) { + delete consumers; +} + +// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil +struct rocksdb_memory_usage_t { + uint64_t mem_table_total; + uint64_t mem_table_unflushed; + uint64_t mem_table_readers_total; + uint64_t cache_total; +}; + +// estimates amount of memory occupied by consumers (dbs and caches) +rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( + rocksdb_memory_consumers_t* consumers, char** errptr) { + vector<DB*> dbs; + for (auto db : consumers->dbs) { + dbs.push_back(db->rep); + } + + unordered_set<const Cache*> cache_set; + for (auto cache : consumers->caches) { + cache_set.insert(const_cast<const Cache*>(cache->rep.get())); + } + + std::map<MemoryUtil::UsageType, uint64_t> usage_by_type; + + auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, + &usage_by_type); + if (SaveError(errptr, status)) { + return nullptr; + } + + auto result = new rocksdb_memory_usage_t; + result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal]; + result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed]; + result->mem_table_readers_total = + usage_by_type[MemoryUtil::kTableReadersTotal]; + result->cache_total = usage_by_type[MemoryUtil::kCacheTotal]; + return result; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_unflushed; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_readers_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->cache_total; +} + +void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.dump_malloc_stats = val; +} + +void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.memtable_whole_key_filtering = val; +} + +void rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.avoid_unnecessary_blocking_io = val; +} + +unsigned char rocksdb_options_get_avoid_unnecessary_blocking_io( + rocksdb_options_t* opt) { + return opt->rep.avoid_unnecessary_blocking_io; +} + +// deletes container with memory usage estimates +void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { + delete usage; +} + +void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) { + CancelAllBackgroundWork(db->rep, wait); +} + +void rocksdb_disable_manual_compaction(rocksdb_t* db) { + db->rep->DisableManualCompaction(); +} + +void rocksdb_enable_manual_compaction(rocksdb_t* db) { + db->rep->EnableManualCompaction(); +} + +} // end extern "C"
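+
+// Illustrative usage sketch (not from the upstream sources): how a client of
+// the transaction wrappers defined above might write both inside and outside
+// a transaction. The option objects (options, txn_db_options, woptions,
+// txn_options) and the path are assumed to be created and cleaned up
+// elsewhere; error handling is elided to a single errptr.
+//
+//   char* err = NULL;
+//   rocksdb_transactiondb_t* txn_db = rocksdb_transactiondb_open(
+//       options, txn_db_options, "/tmp/txn_db_example", &err);
+//   rocksdb_transaction_t* txn =
+//       rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+//   rocksdb_transaction_put(txn, "k1", 2, "v1", 2, &err);  // inside the txn
+//   rocksdb_transactiondb_put(txn_db, woptions, "k2", 2, "v2", 2,
+//                             &err);                       // outside the txn
+//   rocksdb_transaction_commit(txn, &err);
+//   rocksdb_transaction_destroy(txn);
+//   rocksdb_transactiondb_close(txn_db);
+
+#endif  // !ROCKSDB_LITE diff --git 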
a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c new file mode 100644 index 000000000..249ab9023 --- /dev/null +++ b/src/rocksdb/db/c_test.c @@ -0,0 +1,3476 @@ +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. */ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include <stdio.h> + +#ifndef ROCKSDB_LITE // Lite does not support C API + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "rocksdb/c.h" +#ifndef OS_WIN +#include <unistd.h> +#endif +#include <inttypes.h> + +// Cannot use port/port.h macros as this is a C file +#ifdef OS_WIN +#include <windows.h> + +// Ok for uniqueness +int geteuid() { + int result = 0; + + result = ((int)GetCurrentProcessId() << 16); + result |= (int)GetCurrentThreadId(); + + return result; +} + +#endif + +const char* phase = ""; +static char dbname[200]; +static char sstfilename[200]; +static char dbbackupname[200]; +static char dbcheckpointname[200]; +static char dbpathname[200]; +static char secondary_path[200]; + +static void StartPhase(const char* name) { + fprintf(stderr, "=== Test %s\n", name); + phase = name; +} +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) // getenv security warning +#endif +static const char* GetTempDir(void) { + const char* ret = getenv("TEST_TMPDIR"); + if (ret == NULL || ret[0] == '\0') +#ifdef OS_WIN + ret = getenv("TEMP"); +#else + ret = "/tmp"; +#endif + return ret; +} +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#define CheckNoError(err) \ + if ((err) != NULL) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ + abort(); \ + } + +#define CheckCondition(cond) \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ + abort(); \ + } + +static void CheckEqual(const char* expected, const char* v, size_t n) { + if (expected == NULL && v == NULL) { + // ok + } else if (expected != NULL && v != NULL && n == strlen(expected) && + memcmp(expected, v, n) == 0) { + // ok + return; + } else { + fprintf(stderr, "%s: expected '%s', got '%s'\n", phase, + (expected ? expected : "(null)"), (v ? 
v : "(null)")); + abort(); + } +} + +static void Free(char** ptr) { + if (*ptr) { + free(*ptr); + *ptr = NULL; + } +} + +static void CheckValue(char* err, const char* expected, char** actual, + size_t actual_length) { + CheckNoError(err); + CheckEqual(expected, *actual, actual_length); + Free(actual); +} + +static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_get(db, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* handle, const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + const char* val; + rocksdb_pinnableslice_t* p; + p = rocksdb_get_pinned(db, options, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* handle, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + const char* val; + rocksdb_pinnableslice_t* p; + p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckMultiGetValues(size_t num_keys, char** values, + size_t* values_sizes, char** errs, + const char** expected) { + for (size_t i = 0; i < num_keys; i++) { + CheckNoError(errs[i]); + CheckEqual(expected[i], values[i], values_sizes[i]); + Free(&values[i]); + } +} + +static void CheckIter(rocksdb_iterator_t* iter, const char* key, + const char* val) { + size_t len; + const char* str; + str = rocksdb_iter_key(iter, &len); + CheckEqual(key, str, len); + str = rocksdb_iter_value(iter, &len); + CheckEqual(val, str, len); +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckPut(void* ptr, const char* k, size_t klen, const char* v, + size_t vlen) { + int* state = (int*)ptr; + CheckCondition(*state < 2); + switch (*state) { + case 0: + CheckEqual("bar", k, klen); + CheckEqual("b", v, vlen); + break; + case 1: + CheckEqual("box", k, klen); + CheckEqual("c", v, vlen); + break; + } + (*state)++; +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckDel(void* ptr, const char* k, size_t klen) { + int* state = (int*)ptr; + CheckCondition(*state == 2); + CheckEqual("bar", k, klen); + (*state)++; +} + +static void CmpDestroy(void* arg) { (void)arg; } + +static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, + size_t blen) { + (void)arg; + size_t n = (alen < blen) ? 
alen : blen; + int r = memcmp(a, b, n); + if (r == 0) { + if (alen < blen) + r = -1; + else if (alen > blen) + r = +1; + } + return r; +} + +static const char* CmpName(void* arg) { + (void)arg; + return "foo"; +} + +// Custom compaction filter +static void CFilterDestroy(void* arg) { (void)arg; } +static const char* CFilterName(void* arg) { + (void)arg; + return "foo"; +} +static unsigned char CFilterFilter(void* arg, int level, const char* key, + size_t key_length, + const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed) { + (void)arg; + (void)level; + (void)existing_value; + (void)value_length; + if (key_length == 3) { + if (memcmp(key, "bar", key_length) == 0) { + return 1; + } else if (memcmp(key, "baz", key_length) == 0) { + *value_changed = 1; + *new_value = "newbazvalue"; + *new_value_length = 11; + return 0; + } + } + return 0; +} + +static void CFilterFactoryDestroy(void* arg) { (void)arg; } +static const char* CFilterFactoryName(void* arg) { + (void)arg; + return "foo"; +} +static rocksdb_compactionfilter_t* CFilterCreate( + void* arg, rocksdb_compactionfiltercontext_t* context) { + (void)arg; + (void)context; + return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter, + CFilterName); +} + +void CheckMetaData(rocksdb_column_family_metadata_t* cf_meta, + const char* expected_cf_name) { + char* cf_name = rocksdb_column_family_metadata_get_name(cf_meta); + assert(strcmp(cf_name, expected_cf_name) == 0); + rocksdb_free(cf_name); + + size_t cf_size = rocksdb_column_family_metadata_get_size(cf_meta); + assert(cf_size > 0); + size_t cf_file_count = + rocksdb_column_family_metadata_get_file_count(cf_meta); + assert(cf_file_count > 0); + + uint64_t total_level_size = 0; + size_t total_file_count = 0; + size_t level_count = rocksdb_column_family_metadata_get_level_count(cf_meta); + assert(level_count > 0); + for (size_t l = 0; l < level_count; ++l) { + rocksdb_level_metadata_t* level_meta = + rocksdb_column_family_metadata_get_level_metadata(cf_meta, l); + assert(level_meta); + assert(rocksdb_level_metadata_get_level(level_meta) >= (int)l); + uint64_t level_size = rocksdb_level_metadata_get_size(level_meta); + uint64_t file_size_in_level = 0; + + size_t file_count = rocksdb_level_metadata_get_file_count(level_meta); + total_file_count += file_count; + for (size_t f = 0; f < file_count; ++f) { + rocksdb_sst_file_metadata_t* file_meta = + rocksdb_level_metadata_get_sst_file_metadata(level_meta, f); + assert(file_meta); + + uint64_t file_size = rocksdb_sst_file_metadata_get_size(file_meta); + assert(file_size > 0); + file_size_in_level += file_size; + + char* file_name = + rocksdb_sst_file_metadata_get_relative_filename(file_meta); + assert(file_name); + assert(strlen(file_name) > 0); + rocksdb_free(file_name); + + size_t smallest_key_len; + char* smallest_key = rocksdb_sst_file_metadata_get_smallestkey( + file_meta, &smallest_key_len); + assert(smallest_key); + assert(smallest_key_len > 0); + size_t largest_key_len; + char* largest_key = + rocksdb_sst_file_metadata_get_largestkey(file_meta, &largest_key_len); + assert(largest_key); + assert(largest_key_len > 0); + rocksdb_free(smallest_key); + rocksdb_free(largest_key); + + rocksdb_sst_file_metadata_destroy(file_meta); + } + assert(level_size == file_size_in_level); + total_level_size += level_size; + rocksdb_level_metadata_destroy(level_meta); + } + assert(total_file_count > 0); + assert(cf_size == total_level_size); +} + +void GetAndCheckMetaData(rocksdb_t* 
db) { + rocksdb_column_family_metadata_t* cf_meta = + rocksdb_get_column_family_metadata(db); + + CheckMetaData(cf_meta, "default"); + + rocksdb_column_family_metadata_destroy(cf_meta); +} + +void GetAndCheckMetaDataCf(rocksdb_t* db, + rocksdb_column_family_handle_t* handle, + const char* cf_name) { + // Compact to make sure we have at least one sst file to obtain metadata. + rocksdb_compact_range_cf(db, handle, NULL, 0, NULL, 0); + + rocksdb_column_family_metadata_t* cf_meta = + rocksdb_get_column_family_metadata_cf(db, handle); + + CheckMetaData(cf_meta, cf_name); + + rocksdb_column_family_metadata_destroy(cf_meta); +} + +static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options, + rocksdb_readoptions_t* roptions, + rocksdb_writeoptions_t* woptions) { + char* err = NULL; + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "foovalue"); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", "barvalue"); + rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "baz", "bazvalue"); + + // Disable compaction + rocksdb_disable_manual_compaction(db); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + // should not filter anything when disabled + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + CheckGet(db, roptions, "baz", "bazvalue"); + // Re-enable compaction + rocksdb_enable_manual_compaction(db); + + // Force compaction + rocksdb_compact_range(db, NULL, 0, NULL, 0); + // should have filtered bar, but not foo + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "baz", "newbazvalue"); + + rocksdb_suggest_compact_range(db, "bar", 3, "foo", 3, &err); + GetAndCheckMetaData(db); + CheckNoError(err); + + return db; +} + +// Custom merge operator +static void MergeOperatorDestroy(void* arg) { (void)arg; } +static const char* MergeOperatorName(void* arg) { + (void)arg; + return "TestMergeOperator"; +} +static char* MergeOperatorFullMerge( + void* arg, const char* key, size_t key_length, const char* existing_value, + size_t existing_value_length, const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + (void)existing_value; + (void)existing_value_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static char* MergeOperatorPartialMerge(void* arg, const char* key, + size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, + int num_operands, unsigned char* success, + size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} + +static void CheckTxnGet(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, 
val_len); + Free(&val); +} + +static void CheckTxnGetCF(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get_cf(txn, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnPinGet(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + rocksdb_pinnableslice_t* p = NULL; + const char* val = NULL; + char* err = NULL; + size_t val_len; + p = rocksdb_transaction_get_pinned(txn, options, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckTxnPinGetCF(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + rocksdb_pinnableslice_t* p = NULL; + const char* val = NULL; + char* err = NULL; + size_t val_len; + p = rocksdb_transaction_get_pinned_cf(txn, options, column_family, key, + strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len, + &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + rocksdb_pinnableslice_t* p = NULL; + const char* val = NULL; + char* err = NULL; + size_t val_len; + p = rocksdb_transactiondb_get_pinned(txn_db, options, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckTxnDBPinGetCF(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + rocksdb_pinnableslice_t* p = NULL; + const char* val = NULL; + char* err = NULL; + size_t val_len; + p = rocksdb_transactiondb_get_pinned_cf(txn_db, options, column_family, key, + strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env, + bool ignore_unknown_options, + rocksdb_cache_t* cache, + rocksdb_comparator_t* cmp, + const size_t expected_num_column_families, + const char** expected_cf_names, + const char* expected_open_err) { + 
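// Flow of this helper: rocksdb_load_latest_options reads the most recent
+ // OPTIONS file; the discovered column family names are checked against
+ // expected_cf_names; the db is then re-opened with the loaded options.
+ // A NULL expected_open_err means the open is expected to succeed;
+ // otherwise the open must fail with exactly that message.
+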
rocksdb_options_t* db_options; + size_t num_column_families; + char** list_column_family_names; + rocksdb_options_t** list_column_family_options; + char* err = 0; + + // load the latest rocksdb options + rocksdb_load_latest_options(db_name, env, ignore_unknown_options, cache, + &db_options, &num_column_families, + &list_column_family_names, + &list_column_family_options, &err); + assert(num_column_families == expected_num_column_families); + CheckNoError(err); + + // verify the loaded options by opening the db. + rocksdb_options_set_error_if_exists(db_options, 0); + + char** list_const_cf_names = + (char**)malloc(num_column_families * sizeof(char*)); + rocksdb_options_t** list_const_cf_options = (rocksdb_options_t**)malloc( + num_column_families * sizeof(rocksdb_options_t*)); + for (size_t i = 0; i < num_column_families; ++i) { + assert(strcmp(list_column_family_names[i], expected_cf_names[i]) == 0); + list_const_cf_names[i] = list_column_family_names[i]; + if (cmp) { + rocksdb_options_set_comparator(list_column_family_options[i], cmp); + } + list_const_cf_options[i] = list_column_family_options[i]; + } + rocksdb_column_family_handle_t** handles = + (rocksdb_column_family_handle_t**)malloc( + num_column_families * sizeof(rocksdb_column_family_handle_t*)); + + rocksdb_t* db = rocksdb_open_column_families( + db_options, db_name, (int)num_column_families, + (const char* const*)list_const_cf_names, + (const rocksdb_options_t* const*)list_const_cf_options, handles, &err); + if (expected_open_err == NULL) { + CheckNoError(err); + for (size_t i = 0; i < num_column_families; ++i) { + rocksdb_column_family_handle_destroy(handles[i]); + } + free(handles); + rocksdb_close(db); + } else { + assert(err != NULL); + assert(strcmp(err, expected_open_err) == 0); + free(handles); + free(err); + } + + free(list_const_cf_names); + free(list_const_cf_options); + rocksdb_load_latest_options_destroy(db_options, list_column_family_names, + list_column_family_options, + num_column_families); +} + +int main(int argc, char** argv) { + (void)argc; + (void)argv; + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_dbpath_t* dbpath; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_compactoptions_t* coptions; + rocksdb_block_based_table_options_t* table_options; + rocksdb_readoptions_t* roptions; + rocksdb_writeoptions_t* woptions; + rocksdb_ratelimiter_t* rate_limiter; + rocksdb_transactiondb_t* txn_db; + rocksdb_transactiondb_options_t* txn_db_options; + rocksdb_transaction_t* txn; + rocksdb_transaction_options_t* txn_options; + rocksdb_optimistictransactiondb_t* otxn_db; + rocksdb_optimistictransaction_options_t* otxn_options; + char* err = NULL; + int run = -1; + + snprintf(dbname, sizeof(dbname), "%s/rocksdb_c_test-%d", GetTempDir(), + ((int)geteuid())); + + snprintf(dbbackupname, sizeof(dbbackupname), "%s/rocksdb_c_test-%d-backup", + GetTempDir(), ((int)geteuid())); + + snprintf(dbcheckpointname, sizeof(dbcheckpointname), + "%s/rocksdb_c_test-%d-checkpoint", GetTempDir(), ((int)geteuid())); + + snprintf(sstfilename, sizeof(sstfilename), "%s/rocksdb_c_test-%d-sst", + GetTempDir(), ((int)geteuid())); + + snprintf(dbpathname, sizeof(dbpathname), "%s/rocksdb_c_test-%d-dbpath", + GetTempDir(), ((int)geteuid())); + + StartPhase("create_objects"); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024); + env = rocksdb_create_default_env(); + + rocksdb_create_dir_if_missing(env, GetTempDir(), &err); + 
CheckNoError(err); + + cache = rocksdb_cache_create_lru(100000); + + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + + table_options = rocksdb_block_based_options_create(); + rocksdb_block_based_options_set_block_cache(table_options, cache); + rocksdb_block_based_options_set_data_block_index_type(table_options, 1); + rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75); + rocksdb_options_set_block_based_table_factory(options, table_options); + + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); + rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10); + rocksdb_options_set_ratelimiter(options, rate_limiter); + rocksdb_ratelimiter_destroy(rate_limiter); + + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 1); + + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); + + coptions = rocksdb_compactoptions_create(); + rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + + rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, + 10001); + + StartPhase("destroy"); + rocksdb_destroy_db(options, dbname, &err); + Free(&err); + + StartPhase("open_error"); + rocksdb_open(options, dbname, &err); + CheckCondition(err != NULL); + Free(&err); + + StartPhase("open"); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + + StartPhase("put"); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("backup_and_restore"); + { + rocksdb_destroy_db(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_t* be = + rocksdb_backup_engine_open(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + // need a change to trigger a new backup + rocksdb_delete(db, woptions, "does-not-exist", 14, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + const rocksdb_backup_engine_info_t* bei = + rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) > 1); + rocksdb_backup_engine_info_destroy(bei); + + rocksdb_backup_engine_purge_old_backups(be, 1, &err); + CheckNoError(err); + + bei = rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) == 1); + rocksdb_backup_engine_info_destroy(bei); + + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + + rocksdb_close(db); + + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_restore_options_t* restore_options = + rocksdb_restore_options_create(); + 
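// keep_log_files = 0: restore strictly from the backup contents rather
+ // than also keeping/replaying any live WAL files left in the db dir.
+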
rocksdb_restore_options_set_keep_log_files(restore_options, 0); + rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, + restore_options, &err); + CheckNoError(err); + rocksdb_restore_options_destroy(restore_options); + + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + + CheckGet(db, roptions, "foo", "hello"); + + rocksdb_backup_engine_close(be); + } + + StartPhase("checkpoint"); + { + rocksdb_destroy_db(options, dbcheckpointname, &err); + CheckNoError(err); + + rocksdb_checkpoint_t* checkpoint = + rocksdb_checkpoint_object_create(db, &err); + CheckNoError(err); + + rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err); + CheckNoError(err); + + // start a new database from the checkpoint + rocksdb_close(db); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbcheckpointname, &err); + CheckNoError(err); + + CheckGet(db, roptions, "foo", "hello"); + + rocksdb_checkpoint_object_destroy(checkpoint); + + rocksdb_close(db); + rocksdb_destroy_db(options, dbcheckpointname, &err); + CheckNoError(err); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + } + + StartPhase("compactall"); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrange"); + rocksdb_compact_range(db, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactallopt"); + rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrangeopt"); + rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + // Simple check cache usage + StartPhase("cache_usage"); + { + rocksdb_readoptions_set_pin_data(roptions, 1); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + rocksdb_iter_seek(iter, "foo", 3); + + size_t usage = rocksdb_cache_get_usage(cache); + CheckCondition(usage > 0); + + size_t pin_usage = rocksdb_cache_get_pinned_usage(cache); + CheckCondition(pin_usage > 0); + + rocksdb_iter_next(iter); + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_pin_data(roptions, 0); + } + + StartPhase("addfile"); + { + rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create(); + rocksdb_options_t* io_options = rocksdb_options_create(); + rocksdb_sstfilewriter_t* writer = + rocksdb_sstfilewriter_create(env_opt, io_options); + + remove(sstfilename); + rocksdb_sstfilewriter_open(writer, sstfilename, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_finish(writer, &err); + CheckNoError(err); + + rocksdb_ingestexternalfileoptions_t* ing_opt = + rocksdb_ingestexternalfileoptions_create(); + const char* file_list[1] = {sstfilename}; + rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err); + CheckNoError(err); + CheckGet(db, roptions, "sstk1", "v1"); + CheckGet(db, roptions, "sstk2", "v2"); + CheckGet(db, roptions, "sstk3", "v3"); + + remove(sstfilename); + rocksdb_sstfilewriter_open(writer, sstfilename, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err); + CheckNoError(err); + 
rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_finish(writer, &err); + CheckNoError(err); + + rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err); + CheckNoError(err); + CheckGet(db, roptions, "sstk1", "v1"); + CheckGet(db, roptions, "sstk2", "v4"); + CheckGet(db, roptions, "sstk22", "v5"); + CheckGet(db, roptions, "sstk3", "v6"); + + rocksdb_sstfilewriter_open(writer, sstfilename, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "abc1", 4, "v7", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "abc2", 4, "v8", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "abc3", 4, "v9", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "abc4", 4, "v10", 3, &err); + CheckNoError(err); + rocksdb_sstfilewriter_delete_range(writer, "abc1", 4, "abc4", 4, &err); + CheckNoError(err); + rocksdb_sstfilewriter_finish(writer, &err); + CheckNoError(err); + + rocksdb_ingestexternalfileoptions_destroy(ing_opt); + rocksdb_sstfilewriter_destroy(writer); + rocksdb_options_destroy(io_options); + rocksdb_envoptions_destroy(env_opt); + + // Delete all keys we just ingested + rocksdb_delete(db, woptions, "sstk1", 5, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk2", 5, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk22", 6, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk3", 5, &err); + CheckNoError(err); + } + + StartPhase("writebatch"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "bay", 3, "d", 1); + rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "bay", "d"); + rocksdb_writebatch_clear(wb); + const char* start_list[1] = {"bay"}; + const size_t start_sizes[1] = {3}; + const char* end_list[1] = {"baz"}; + const size_t end_sizes[1] = {3}; + rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list, + end_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "bay", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_vectors"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", "xyz"); + rocksdb_writebatch_delete(wb, "zap", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_savepoint"); 
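+ // Save points nest like a stack: set_save_point pushes a marker,
+ // pop_save_point discards the newest marker (keeping any writes made so
+ // far), and rollback_to_save_point undoes every write made since the
+ // newest remaining marker. Below: two save points are set, one is popped,
+ // a putv is added, then rolled back, so the batch written is empty.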
+ { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_set_save_point(wb); + rocksdb_writebatch_set_save_point(wb); + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_pop_save_point(wb, &err); + CheckNoError(err); + rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_writebatch_rollback_to_save_point(wb, &err); + CheckNoError(err); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_rep"); + { + rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb1, "baz", 3, "d", 1); + rocksdb_writebatch_put(wb1, "quux", 4, "e", 1); + rocksdb_writebatch_delete(wb1, "quux", 4); + size_t repsize1 = 0; + const char* rep = rocksdb_writebatch_data(wb1, &repsize1); + rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1); + CheckCondition(rocksdb_writebatch_count(wb1) == + rocksdb_writebatch_count(wb2)); + size_t repsize2 = 0; + CheckCondition( + memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0); + rocksdb_writebatch_destroy(wb1); + rocksdb_writebatch_destroy(wb2); + } + + StartPhase("writebatch_wi"); + { + rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1); + rocksdb_writebatch_wi_clear(wbi); + rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1); + rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1); + rocksdb_writebatch_wi_delete(wbi, "bar", 3); + int count = rocksdb_writebatch_wi_count(wbi); + CheckCondition(count == 3); + size_t size; + char* value; + value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, + &err); + CheckValue(err, "c", &value, size); + value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, + &err); + CheckValue(err, NULL, &value, size); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, + "foo", 3, &size, &err); + CheckValue(err, "hello", &value, size); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, + "box", 3, &size, &err); + CheckValue(err, "c", &value, size); + rocksdb_write_writebatch_wi(db, woptions, wbi, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_wi_clear(wbi); + rocksdb_writebatch_wi_destroy(wbi); + } + + StartPhase("writebatch_wi_vectors"); + { + rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1); + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", "xyz"); + rocksdb_writebatch_wi_delete(wb, "zap", 3); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_wi_destroy(wb); + } + + StartPhase("writebatch_wi_savepoint"); + { + rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_set_save_point(wb); + const char* k_list[2] = {"z", "ap"}; + 
const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_writebatch_wi_rollback_to_save_point(wb, &err); + CheckNoError(err); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_wi_destroy(wb); + } + + StartPhase("iter"); + { + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "box", "c"); + rocksdb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_prev(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek_for_prev(iter, "g", 1); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek_for_prev(iter, "box", 3); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + } + + StartPhase("wbwi_iter"); + { + rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions); + rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1); + rocksdb_writebatch_wi_delete(wbi, "foo", 3); + rocksdb_iterator_t* iter = + rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar", "b"); + rocksdb_iter_next(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckIter(iter, "bar", "b"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "bar", "b"); + rocksdb_iter_seek_for_prev(iter, "c", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek_for_prev(iter, "box", 3); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + rocksdb_writebatch_wi_destroy(wbi); + } + + StartPhase("multiget"); + { + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + const char* expected[3] = {"c", "hello", NULL}; + rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, + errs); + CheckMultiGetValues(3, vals, vals_sizes, errs, expected); + } + + StartPhase("pin_get"); + { + CheckPinGet(db, roptions, "box", "c"); + CheckPinGet(db, roptions, "foo", "hello"); + CheckPinGet(db, roptions, "notfound", NULL); + } + + StartPhase("approximate_sizes"); + { + int i; + int n = 20000; + char keybuf[100]; + char valbuf[100]; + uint64_t sizes[2]; + const char* start[2] = {"a", "k00000000000000010000"}; + size_t start_len[2] = {1, 21}; + const char* limit[2] = {"k00000000000000010000", "z"}; + size_t limit_len[2] = {21, 1}; + rocksdb_writeoptions_set_sync(woptions, 0); + for (i = 0; i < n; i++) { + snprintf(keybuf, sizeof(keybuf), "k%020d", i); + snprintf(valbuf, sizeof(valbuf), "v%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + &err); + CheckNoError(err); + } + rocksdb_approximate_sizes(db, 
2, start, start_len, limit, limit_len, sizes, + &err); + CheckNoError(err); + CheckCondition(sizes[0] > 0); + CheckCondition(sizes[1] > 0); + } + + StartPhase("property"); + { + char* prop = rocksdb_property_value(db, "nosuchprop"); + CheckCondition(prop == NULL); + prop = rocksdb_property_value(db, "rocksdb.stats"); + CheckCondition(prop != NULL); + Free(&prop); + } + + StartPhase("snapshot"); + { + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + CheckGet(db, roptions, "foo", NULL); + rocksdb_release_snapshot(db, snap); + } + StartPhase("snapshot_with_memtable_inplace_update"); + { + rocksdb_close(db); + const rocksdb_snapshot_t* snap = NULL; + const char* s_key = "foo_snap"; + const char* value1 = "hello_s1"; + const char* value2 = "hello_s2"; + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + rocksdb_options_set_inplace_update_support(options, 1); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, s_key, 8, value1, 8, &err); + CheckNoError(err); + snap = rocksdb_create_snapshot(db); + assert(snap != NULL); + rocksdb_put(db, woptions, s_key, 8, value2, 8, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", NULL); + // the snapshot is not honored for s_key: with inplace update support + // enabled, the memtable entry was overwritten in place, so the read + // returns the latest value + CheckGet(db, roptions, s_key, value2); + // restore the data and options + rocksdb_delete(db, woptions, s_key, 8, &err); + CheckGet(db, roptions, s_key, NULL); + rocksdb_release_snapshot(db, snap); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_options_set_inplace_update_support(options, 0); + rocksdb_options_set_allow_concurrent_memtable_write(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } + StartPhase("repair"); + { + // If we do not compact here, then the lazy deletion of + // files (https://reviews.facebook.net/D6123) would leave + // around deleted files and the repair process will find + // those files and put them back into the database.
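+ // (wal_recovery_mode 2 below corresponds to kPointInTimeRecovery in the
+ // C++ WALRecoveryMode enum.)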
+ rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_close(db); + rocksdb_options_set_create_if_missing(options, 0); + rocksdb_options_set_error_if_exists(options, 0); + rocksdb_options_set_wal_recovery_mode(options, 2); + rocksdb_repair_db(options, dbname, &err); + CheckNoError(err); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } + + StartPhase("filter"); + for (run = 1; run <= 4; run++) { + // run=0 uses custom filter (not currently supported) + // run=1 uses old block-based bloom filter + // run=2 run uses full bloom filter + // run=3 uses Ribbon + // run=4 uses Ribbon-Bloom hybrid configuration + CheckNoError(err); + rocksdb_filterpolicy_t* policy; + if (run == 1) { + policy = rocksdb_filterpolicy_create_bloom(8.0); + } else if (run == 2) { + policy = rocksdb_filterpolicy_create_bloom_full(8.0); + } else if (run == 3) { + policy = rocksdb_filterpolicy_create_ribbon(8.0); + } else { + policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1); + } + rocksdb_block_based_options_set_filter_policy(table_options, policy); + + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_block_based_table_factory(options, table_options); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + + { + // Add enough keys to get just one reasonably populated Bloom filter + const int keys_to_add = 1500; + int i; + char keybuf[100]; + for (i = 0; i < keys_to_add; i++) { + snprintf(keybuf, sizeof(keybuf), "yes%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err); + CheckNoError(err); + } + } + rocksdb_compact_range(db, NULL, 0, NULL, 0); + + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + + { + // Query some keys not added to identify Bloom filter implementation + // from false positive queries, using perfcontext to detect Bloom + // filter behavior + rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create(); + rocksdb_perfcontext_reset(perf); + + const int keys_to_query = 10000; + int i; + char keybuf[100]; + for (i = 0; i < keys_to_query; i++) { + snprintf(keybuf, sizeof(keybuf), "no%020d", i); + CheckGet(db, roptions, keybuf, NULL); + } + + const int hits = + (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count); + if (run == 0) { + // Due to half true, half false with fake filter result + CheckCondition(hits == keys_to_query / 2); + } else if (run == 1 || run == 2 || run == 4) { + // For run == 1, block-based Bloom is no longer available in public + // API; attempting to enable it enables full Bloom instead. 
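+ // The exact hit counts asserted below are fingerprints of the filter
+ // schemas for this fixed key set; they would change only if the schema
+ // or format_version changed.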
+ // + // Essentially a fingerprint of full Bloom schema, format_version=5 + CheckCondition(hits == 188); + } else { + // Essentially a fingerprint of Ribbon schema + CheckCondition(hits == 226); + } + CheckCondition( + (keys_to_query - hits) == + (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count)); + + rocksdb_perfcontext_destroy(perf); + } + + // Reset the policy + rocksdb_block_based_options_set_filter_policy(table_options, NULL); + rocksdb_options_set_block_based_table_factory(options, table_options); + } + + StartPhase("compaction_filter"); + { + rocksdb_options_t* options_with_filter = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter, 1); + rocksdb_compactionfilter_t* cfilter; + cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy, + CFilterFilter, CFilterName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options_with_filter, dbname, &err); + rocksdb_options_set_compaction_filter(options_with_filter, cfilter); + db = CheckCompaction(db, options_with_filter, roptions, woptions); + + rocksdb_options_set_compaction_filter(options_with_filter, NULL); + rocksdb_compactionfilter_destroy(cfilter); + rocksdb_options_destroy(options_with_filter); + } + + StartPhase("compaction_filter_factory"); + { + rocksdb_options_t* options_with_filter_factory = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter_factory, 1); + rocksdb_compactionfilterfactory_t* factory; + factory = rocksdb_compactionfilterfactory_create( + NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options_with_filter_factory, dbname, &err); + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + factory); + db = CheckCompaction(db, options_with_filter_factory, roptions, woptions); + + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + NULL); + rocksdb_options_destroy(options_with_filter_factory); + } + + StartPhase("merge_operator"); + { + rocksdb_mergeoperator_t* merge_operator; + merge_operator = rocksdb_mergeoperator_create( + NULL, MergeOperatorDestroy, MergeOperatorFullMerge, + MergeOperatorPartialMerge, NULL, MergeOperatorName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_merge_operator(options, merge_operator); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "foovalue"); + rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "fake"); + + // Merge of a non-existing value + rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", "fake"); + } + + StartPhase("columnfamilies"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_close(db); + { + const char* expected_cf_names[1] = {"default"}; + LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1, + expected_cf_names, NULL); + } + + rocksdb_options_set_create_if_missing(db_options, 0); + db = rocksdb_open(db_options, dbname, &err); + rocksdb_column_family_handle_t* cfh; + cfh = 
rocksdb_create_column_family(db, db_options, "cf1", &err); + rocksdb_column_family_handle_destroy(cfh); + CheckNoError(err); + rocksdb_close(db); + + size_t cflen; + char** column_fams = + rocksdb_list_column_families(db_options, dbname, &cflen, &err); + CheckNoError(err); + CheckEqual("default", column_fams[0], 7); + CheckEqual("cf1", column_fams[1], 3); + CheckCondition(cflen == 2); + rocksdb_list_column_families_destroy(column_fams, cflen); + + rocksdb_options_t* cf_options = rocksdb_options_create(); + + const char* cf_names[2] = {"default", "cf1"}; + const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options}; + rocksdb_column_family_handle_t* handles[2]; + + LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names, + NULL); + + db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, + handles, &err); + CheckNoError(err); + + rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err); + CheckNoError(err); + + rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err); + CheckNoError(err); + rocksdb_suggest_compact_range_cf(db, handles[1], "foo", 3, "foobar9", 7, + &err); + CheckNoError(err); + + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_flush_cf(db, flush_options, handles[1], &err); + CheckNoError(err) rocksdb_flushoptions_destroy(flush_options); + + CheckGetCF(db, roptions, handles[1], "foo", "hello"); + CheckPinGetCF(db, roptions, handles[1], "foo", "hello"); + + rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err); + CheckNoError(err); + + rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4", + 7, &err); + CheckNoError(err); + + CheckGetCF(db, roptions, handles[1], "foo", NULL); + CheckPinGetCF(db, roptions, handles[1], "foo", NULL); + + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1); + rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1); + rocksdb_writebatch_put_cf(wb, handles[1], "buff", 4, "rocksdb", 7); + rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGetCF(db, roptions, handles[1], "baz", NULL); + CheckGetCF(db, roptions, handles[1], "bar", NULL); + CheckGetCF(db, roptions, handles[1], "box", "c"); + CheckGetCF(db, roptions, handles[1], "buff", "rocksdb"); + CheckPinGetCF(db, roptions, handles[1], "baz", NULL); + CheckPinGetCF(db, roptions, handles[1], "bar", NULL); + CheckPinGetCF(db, roptions, handles[1], "box", "c"); + CheckPinGetCF(db, roptions, handles[1], "buff", "rocksdb"); + rocksdb_writebatch_destroy(wb); + + rocksdb_flush_wal(db, 1, &err); + CheckNoError(err); + + const char* keys[3] = {"box", "box", "barfooxx"}; + const rocksdb_column_family_handle_t* get_handles[3] = { + handles[0], handles[1], handles[1]}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, + vals_sizes, errs); + + int i; + for (i = 0; i < 3; i++) { + 
CheckEqual(NULL, errs[i], 0); + switch (i) { + case 0: + CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf + break; + case 1: + CheckEqual("c", vals[i], vals_sizes[i]); // bingo + break; + case 2: + CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found + break; + } + Free(&vals[i]); + } + + { + const char* batched_keys[4] = {"box", "buff", "barfooxx", "box"}; + const size_t batched_keys_sizes[4] = {3, 4, 8, 3}; + const char* expected_value[4] = {"c", "rocksdb", NULL, "c"}; + char* batched_errs[4]; + + rocksdb_pinnableslice_t* pvals[4]; + rocksdb_batched_multi_get_cf(db, roptions, handles[1], 4, batched_keys, + batched_keys_sizes, pvals, batched_errs, + false); + const char* val; + size_t val_len; + for (i = 0; i < 4; ++i) { + val = rocksdb_pinnableslice_value(pvals[i], &val_len); + CheckNoError(batched_errs[i]); + CheckEqual(expected_value[i], val, val_len); + rocksdb_pinnableslice_destroy(pvals[i]); + } + } + + { + unsigned char value_found = 0; + + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + NULL, NULL, NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + &vals[0], &vals_sizes[0], NULL, 0, + &value_found)); + if (value_found) { + Free(&vals[0]); + } + + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, NULL, NULL, + NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, &vals[0], + &vals_sizes[0], NULL, 0, NULL)); + if (value_found) { + Free(&vals[0]); + } + } + + rocksdb_iterator_t* iter = + rocksdb_create_iterator_cf(db, roptions, handles[1]); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) { + i++; + } + CheckCondition(i == 4); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_column_family_handle_t* iters_cf_handles[2] = {handles[0], + handles[1]}; + rocksdb_iterator_t* iters_handles[2]; + rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, + &err); + CheckNoError(err); + + iter = iters_handles[0]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_destroy(iter); + + iter = iters_handles[1]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) { + i++; + } + CheckCondition(i == 4); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + GetAndCheckMetaDataCf(db, handles[1], cf_names[1]); + + rocksdb_drop_column_family(db, handles[1], &err); + CheckNoError(err); + for (i = 0; i < 2; i++) { + rocksdb_column_family_handle_destroy(handles[i]); + } + rocksdb_close(db); + { + // As column family has been dropped, we expect only one column family. 
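+ // (Dropping a column family rewrites the OPTIONS file, so the reload
+ // below sees only "default".)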
+ const char* expected_cf_names[1] = {"default"}; + LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1, + expected_cf_names, NULL); + } + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(cf_options); + } + + StartPhase("prefix"); + { + // Create new database + rocksdb_options_set_allow_mmap_reads(options, 1); + rocksdb_options_set_prefix_extractor( + options, rocksdb_slicetransform_create_fixed_prefix(3)); + rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); + rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err); + CheckNoError(err); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_seek(iter, "bar", 3); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + CheckCondition(rocksdb_iter_valid(iter)); + + CheckIter(iter, "bar1", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar2", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar3", "bar"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_readoptions_set_total_order_seek(roptions, 1); + iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_seek(iter, "ba", 2); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar1", "bar"); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_total_order_seek(roptions, 0); + + rocksdb_close(db); + + { + const char* expected_cf_names[1] = {"default"}; + LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1, + expected_cf_names, + "Invalid argument: leveldb.BytewiseComparator: " + "does not match existing comparator foo"); + LoadAndCheckLatestOptions(dbname, env, false, cache, cmp, 1, + expected_cf_names, NULL); + } + rocksdb_destroy_db(options, dbname, &err); + } + + // Check memory usage stats + StartPhase("approximate_memory_usage"); + { + // Create database + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_memory_consumers_t* consumers; + consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, db); + rocksdb_memory_consumers_add_cache(consumers, cache); + + // take memory usage report before write-read operation + rocksdb_memory_usage_t* mu1; + mu1 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // Put data (this should affect memtables) + rocksdb_put(db, woptions, "memory", 6, "test", 4, &err); + CheckNoError(err); + CheckGet(db, roptions, "memory", "test"); + + // take memory usage report after write-read operation + rocksdb_memory_usage_t* mu2; + mu2 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // amount of memory used within memtables should grow + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) 
>= + rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); + CheckCondition( + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + + rocksdb_memory_consumers_destroy(consumers); + rocksdb_approximate_memory_usage_destroy(mu1); + rocksdb_approximate_memory_usage_destroy(mu2); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + } + + StartPhase("cuckoo_options"); + { + rocksdb_cuckoo_table_options_t* cuckoo_options; + cuckoo_options = rocksdb_cuckoo_options_create(); + rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5); + rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200); + rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10); + rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1); + rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0); + rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_cuckoo_options_destroy(cuckoo_options); + } + + StartPhase("options"); + { + rocksdb_options_t* o; + o = rocksdb_options_create(); + + // Set and check options. + rocksdb_options_set_allow_ingest_behind(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(o, 10); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(o, 1); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(o, 1); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(o, 1); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(o, 1); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(o, 3); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(o, 100); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(o, 1000); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(o, 21); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(o, 5); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(o, 400); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(o, 7); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(o, 4); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(o, 6); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(o, 8); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(o, 256); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(o, 3); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o)); + + rocksdb_options_set_max_bytes_for_level_base(o, 1024); + CheckCondition(1024 == 
rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(o, 1); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(o, 97); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(o, 23); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(o, 1); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(o, 123456); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(o, 2); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(o, 3); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(o, 5); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(o, 6); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + rocksdb_options_set_log_file_time_to_roll(o, 7); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(o, 8); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(o, 9); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_max_manifest_file_size(o, 12); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(o, 13); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(o, 14); + CheckCondition(14 == rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(o, 1); + CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(o, 15); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(o, 16); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(o, 17); + CheckCondition(17 == 
rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(o, 1); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_stats_dump_period_sec(o, 18); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(o, 5); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(o, 1); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(o, 3); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(o, 1); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(o, 19); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(o, 20); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(o, 21); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(o, 22); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(o, 1); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(o, 1); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(o, 23); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(o, 24); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(o, 25); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(o, 26); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(o, 27); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(o, 1); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(o, 28); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(o, 1); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o)); + + 
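+ // The enum-typed options below are set through plain ints mirroring the
+ // constants in c.h: WAL recovery mode 2 is point-in-time recovery,
+ // compression 5 is rocksdb_lz4hc_compression, bottommost compression 4 is
+ // rocksdb_lz4_compression, and compaction style 2 is rocksdb_fifo_compaction.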
rocksdb_options_set_wal_recovery_mode(o, 2); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(o, 5); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(o, 4); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(o, 2); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_set_manual_wal_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o)); + + rocksdb_options_set_wal_compression(o, 1); + CheckCondition(1 == rocksdb_options_get_wal_compression(o)); + + rocksdb_options_set_experimental_mempurge_threshold(o, 29.0); + CheckCondition(29.0 == + rocksdb_options_get_experimental_mempurge_threshold(o)); + + /* Blob Options */ + rocksdb_options_set_enable_blob_files(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_files(o)); + + rocksdb_options_set_min_blob_size(o, 29); + CheckCondition(29 == rocksdb_options_get_min_blob_size(o)); + + rocksdb_options_set_blob_file_size(o, 30); + CheckCondition(30 == rocksdb_options_get_blob_file_size(o)); + + rocksdb_options_set_blob_compression_type(o, 4); + CheckCondition(4 == rocksdb_options_get_blob_compression_type(o)); + + rocksdb_options_set_enable_blob_gc(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); + + rocksdb_options_set_blob_gc_age_cutoff(o, 0.5); + CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o)); + + rocksdb_options_set_blob_gc_force_threshold(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o)); + + rocksdb_options_set_blob_compaction_readahead_size(o, 262144); + CheckCondition(262144 == + rocksdb_options_get_blob_compaction_readahead_size(o)); + + rocksdb_options_set_blob_file_starting_level(o, 5); + CheckCondition(5 == rocksdb_options_get_blob_file_starting_level(o)); + + rocksdb_options_set_prepopulate_blob_cache(o, 1 /* flush only */); + CheckCondition(1 == rocksdb_options_get_prepopulate_blob_cache(o)); + + // Create a copy that should be equal to the original. 
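+ // (rocksdb_options_create_copy returns an independent deep copy, so every
+ // getter below should report the value that was set on `o` above.)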
+ rocksdb_options_t* copy; + copy = rocksdb_options_create_copy(o); + + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(copy)); + CheckCondition( + 4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(6 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition( + 1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition( + 64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition( + 10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition( + 11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(14 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(1 == 
rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(3 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(21 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(1 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition( + 1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition( + 23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(5 == rocksdb_options_get_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(copy)); + CheckCondition(29.0 == + rocksdb_options_get_experimental_mempurge_threshold(copy)); + + // Copies should be independent. 
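+ // Each setter below mutates only `copy`; the paired check against `o`
+ // verifies the original options object is untouched.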
+ rocksdb_options_set_allow_ingest_behind(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(copy, 20); + CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(copy, 0); + CheckCondition(0 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(copy, 0); + CheckCondition(0 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(copy, 0); + CheckCondition(0 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(copy, 0); + CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(copy, 2); + CheckCondition(2 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(copy, 200); + CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(copy, 2000); + CheckCondition(2000 == rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(copy, 42); + CheckCondition(42 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(copy, 3); + CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(copy, 4000); + CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(copy, 6); + CheckCondition(6 == rocksdb_options_get_num_levels(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14); + CheckCondition( + 14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61); + CheckCondition(61 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(copy, 17); + CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(copy, 128); + CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(copy, 13); + CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(3 == 
rocksdb_options_get_target_file_size_multiplier(o)); + + rocksdb_options_set_max_bytes_for_level_base(copy, 900); + CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0); + CheckCondition(8.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(copy, 2000); + CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146); + CheckCondition(146 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128); + CheckCondition( + 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000); + CheckCondition(9000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(copy, 90001); + CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(copy, 12); + CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(copy, 13); + CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(copy, 15); + CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(copy, 16); + CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + 
rocksdb_options_set_log_file_time_to_roll(copy, 17); + CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(copy, 18); + CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(copy, 19); + CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110); + CheckCondition( + 110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111); + CheckCondition( + 111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_max_manifest_file_size(copy, 112); + CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(copy, 113); + CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(copy, 114); + CheckCondition(114 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(14 == rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(copy, 115); + CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(copy, 116); + CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(copy, 117); + CheckCondition(117 == + rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy)); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(copy, 0); + CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_stats_dump_period_sec(copy, 218); + CheckCondition(218 == 
rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(copy, 600); + CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(copy, 2); + CheckCondition(2 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(copy, 219); + CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(copy, 120); + CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(copy, 121); + CheckCondition(121 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(copy, 0); + CheckCondition(0 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122); + CheckCondition(122 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(copy, 0); + CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(copy, 0); + CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123); + CheckCondition( + 123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0); + CheckCondition(4.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(copy, 124); + CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(copy, 125); + CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(25 == 
rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(copy, 126); + CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(copy, 127); + CheckCondition(127 == rocksdb_options_get_bloom_locality(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(copy, 0); + CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(copy, 128); + CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(copy, 0); + CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o)); + + rocksdb_options_set_wal_recovery_mode(copy, 1); + CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(copy, 4); + CheckCondition(4 == rocksdb_options_get_compression(copy)); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(copy, 3); + CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(copy, 1); + CheckCondition(1 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(copy, 0); + CheckCondition(0 == rocksdb_options_get_atomic_flush(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_set_experimental_mempurge_threshold(copy, 229.0); + CheckCondition(229.0 == + rocksdb_options_get_experimental_mempurge_threshold(copy)); + CheckCondition(29.0 == + rocksdb_options_get_experimental_mempurge_threshold(o)); + + rocksdb_options_destroy(copy); + rocksdb_options_destroy(o); + } + + StartPhase("read_options"); + { + rocksdb_readoptions_t* ro; + ro = rocksdb_readoptions_create(); + + rocksdb_readoptions_set_verify_checksums(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro)); + + rocksdb_readoptions_set_fill_cache(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_fill_cache(ro)); + + rocksdb_readoptions_set_read_tier(ro, 2); + CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro)); + + rocksdb_readoptions_set_tailing(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_tailing(ro)); + + rocksdb_readoptions_set_readahead_size(ro, 100); + CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro)); + + rocksdb_readoptions_set_prefix_same_as_start(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro)); + + rocksdb_readoptions_set_pin_data(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro)); + + rocksdb_readoptions_set_total_order_seek(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro)); + + rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200); + CheckCondition(200 == + rocksdb_readoptions_get_max_skippable_internal_keys(ro)); + + rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1); + CheckCondition( + 1 == 
rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro)); + + rocksdb_readoptions_set_ignore_range_deletions(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro)); + + rocksdb_readoptions_set_deadline(ro, 300); + CheckCondition(300 == rocksdb_readoptions_get_deadline(ro)); + + rocksdb_readoptions_set_io_timeout(ro, 400); + CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + + rocksdb_readoptions_destroy(ro); + } + + StartPhase("write_options"); + { + rocksdb_writeoptions_t* wo; + wo = rocksdb_writeoptions_create(); + + rocksdb_writeoptions_set_sync(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_sync(wo)); + + rocksdb_writeoptions_disable_WAL(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo)); + + rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_ignore_missing_column_families(wo)); + + rocksdb_writeoptions_set_no_slowdown(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_no_slowdown(wo)); + + rocksdb_writeoptions_set_low_pri(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo)); + + rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo)); + + rocksdb_writeoptions_destroy(wo); + } + + StartPhase("compact_options"); + { + rocksdb_compactoptions_t* co; + co = rocksdb_compactoptions_create(); + + rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_exclusive_manual_compaction(co)); + + rocksdb_compactoptions_set_bottommost_level_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_bottommost_level_compaction(co)); + + rocksdb_compactoptions_set_change_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_change_level(co)); + + rocksdb_compactoptions_set_target_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_target_level(co)); + + rocksdb_compactoptions_destroy(co); + } + + StartPhase("flush_options"); + { + rocksdb_flushoptions_t* fo; + fo = rocksdb_flushoptions_create(); + + rocksdb_flushoptions_set_wait(fo, 1); + CheckCondition(1 == rocksdb_flushoptions_get_wait(fo)); + + rocksdb_flushoptions_destroy(fo); + } + + StartPhase("cache_options"); + { + rocksdb_cache_t* co; + co = rocksdb_cache_create_lru(100); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_set_capacity(co, 200); + CheckCondition(200 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + } + + StartPhase("jemalloc_nodump_allocator"); + { + rocksdb_memory_allocator_t* allocator; + allocator = rocksdb_jemalloc_nodump_allocator_create(&err); + if (err != NULL) { + // not supported on all platforms, allow unsupported error + const char* ni = "Not implemented: "; + size_t ni_len = strlen(ni); + size_t err_len = strlen(err); + + CheckCondition(err_len >= ni_len); + CheckCondition(memcmp(ni, err, ni_len) == 0); + Free(&err); + } else { + rocksdb_cache_t* co; + rocksdb_lru_cache_options_t* copts; + + copts = rocksdb_lru_cache_options_create(); + + rocksdb_lru_cache_options_set_capacity(copts, 100); + rocksdb_lru_cache_options_set_memory_allocator(copts, allocator); + + co = rocksdb_cache_create_lru_opts(copts); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + rocksdb_lru_cache_options_destroy(copts); + } + rocksdb_memory_allocator_destroy(allocator); + } + + StartPhase("env"); + { + rocksdb_env_t* e; + e = 
rocksdb_create_default_env(); + + rocksdb_env_set_background_threads(e, 10); + CheckCondition(10 == rocksdb_env_get_background_threads(e)); + + rocksdb_env_set_high_priority_background_threads(e, 20); + CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e)); + + rocksdb_env_set_low_priority_background_threads(e, 30); + CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e)); + + rocksdb_env_set_bottom_priority_background_threads(e, 40); + CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e)); + + rocksdb_env_destroy(e); + } + + StartPhase("universal_compaction_options"); + { + rocksdb_universal_compaction_options_t* uco; + uco = rocksdb_universal_compaction_options_create(); + + rocksdb_universal_compaction_options_set_size_ratio(uco, 5); + CheckCondition(5 == + rocksdb_universal_compaction_options_get_size_ratio(uco)); + + rocksdb_universal_compaction_options_set_min_merge_width(uco, 15); + CheckCondition( + 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_merge_width(uco, 25); + CheckCondition( + 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, + 35); + CheckCondition( + 35 == + rocksdb_universal_compaction_options_get_max_size_amplification_percent( + uco)); + + rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45); + CheckCondition( + 45 == + rocksdb_universal_compaction_options_get_compression_size_percent(uco)); + + rocksdb_universal_compaction_options_set_stop_style(uco, 1); + CheckCondition(1 == + rocksdb_universal_compaction_options_get_stop_style(uco)); + + rocksdb_universal_compaction_options_destroy(uco); + } + + StartPhase("fifo_compaction_options"); + { + rocksdb_fifo_compaction_options_t* fco; + fco = rocksdb_fifo_compaction_options_create(); + + rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000); + CheckCondition( + 100000 == + rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + + rocksdb_fifo_compaction_options_destroy(fco); + } + + StartPhase("backup_engine_option"); + { + rocksdb_backup_engine_options_t* bdo; + bdo = rocksdb_backup_engine_options_create("path"); + + rocksdb_backup_engine_options_set_share_table_files(bdo, 1); + CheckCondition(1 == + rocksdb_backup_engine_options_get_share_table_files(bdo)); + + rocksdb_backup_engine_options_set_sync(bdo, 1); + CheckCondition(1 == rocksdb_backup_engine_options_get_sync(bdo)); + + rocksdb_backup_engine_options_set_destroy_old_data(bdo, 1); + CheckCondition(1 == + rocksdb_backup_engine_options_get_destroy_old_data(bdo)); + + rocksdb_backup_engine_options_set_backup_log_files(bdo, 1); + CheckCondition(1 == + rocksdb_backup_engine_options_get_backup_log_files(bdo)); + + rocksdb_backup_engine_options_set_backup_rate_limit(bdo, 123); + CheckCondition(123 == + rocksdb_backup_engine_options_get_backup_rate_limit(bdo)); + + rocksdb_backup_engine_options_set_restore_rate_limit(bdo, 37); + CheckCondition(37 == + rocksdb_backup_engine_options_get_restore_rate_limit(bdo)); + + rocksdb_backup_engine_options_set_max_background_operations(bdo, 20); + CheckCondition( + 20 == rocksdb_backup_engine_options_get_max_background_operations(bdo)); + + rocksdb_backup_engine_options_set_callback_trigger_interval_size(bdo, 9000); + CheckCondition( + 9000 == + rocksdb_backup_engine_options_get_callback_trigger_interval_size(bdo)); + + 
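+ // max_valid_backups_to_open caps how many existing backups the engine opens
+ // and validates at initialization. The naming value 2 set below is believed
+ // to select the db-session-id share_files_with_checksum naming scheme
+ // (matching the C++ BackupEngineOptions enum ordering).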
rocksdb_backup_engine_options_set_max_valid_backups_to_open(bdo, 40); + CheckCondition( + 40 == rocksdb_backup_engine_options_get_max_valid_backups_to_open(bdo)); + + rocksdb_backup_engine_options_set_share_files_with_checksum_naming(bdo, 2); + CheckCondition( + 2 == rocksdb_backup_engine_options_get_share_files_with_checksum_naming( + bdo)); + + rocksdb_backup_engine_options_destroy(bdo); + } + + StartPhase("compression_options"); + { + rocksdb_options_t* co; + co = rocksdb_options_create(); + + rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100); + CheckCondition( + 100 == + rocksdb_options_get_compression_options_zstd_max_train_bytes(co)); + + rocksdb_options_set_compression_options_parallel_threads(co, 2); + CheckCondition( + 2 == rocksdb_options_get_compression_options_parallel_threads(co)); + + rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200); + CheckCondition( + 200 == + rocksdb_options_get_compression_options_max_dict_buffer_bytes(co)); + + rocksdb_options_set_compression_options_use_zstd_dict_trainer(co, 0); + CheckCondition( + 0 == rocksdb_options_get_compression_options_use_zstd_dict_trainer(co)); + rocksdb_options_destroy(co); + } + + StartPhase("iterate_upper_bound"); + { + // Create new empty database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor(options, NULL); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "a", 1, "0", 1, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); + CheckNoError(err); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "g1", "0"); + + rocksdb_iter_destroy(iter); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + // iterate_upper_bound points beyond the last expected entry + rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + // should stop here... 
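+ // ("foo2" is an exclusive upper bound: the remaining key "g1" compares >=
+ // the bound, so the iterator must go invalid instead of surfacing it.)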
+ CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + } + } + + StartPhase("transactions"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + // open a TransactionDB + txn_db_options = rocksdb_transactiondb_options_create(); + txn_options = rocksdb_transaction_options_create(); + rocksdb_options_set_create_if_missing(options, 1); + txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err); + CheckNoError(err); + + // put outside a transaction + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + CheckTxnDBPinGet(txn_db, roptions, "foo", "hello"); + + // delete from outside transaction + rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo", NULL); + CheckTxnDBPinGet(txn_db, roptions, "foo", NULL); + + // write batch into TransactionDB + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_transactiondb_write(txn_db, woptions, wb, &err); + rocksdb_writebatch_destroy(wb); + CheckTxnDBGet(txn_db, roptions, "box", "c"); + CheckTxnDBPinGet(txn_db, roptions, "box", "c"); + CheckNoError(err); + + // multi get + { + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + const char* expected[3] = {"c", NULL, NULL}; + rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes, + vals, vals_sizes, errs); + CheckMultiGetValues(3, vals, vals_sizes, errs, expected); + } + + // begin a transaction + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + // put + rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo", "hello"); + CheckTxnPinGet(txn, roptions, "foo", "hello"); + { + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + const char* expected[3] = {"c", "hello", NULL}; + rocksdb_transaction_multi_get(txn, roptions, 3, keys, keys_sizes, vals, + vals_sizes, errs); + CheckMultiGetValues(3, vals, vals_sizes, errs, expected); + } + // delete + rocksdb_transaction_delete(txn, "foo", 3, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo", NULL); + CheckTxnPinGet(txn, roptions, "foo", NULL); + + rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err); + CheckNoError(err); + + // read from outside transaction, before commit + CheckTxnDBGet(txn_db, roptions, "foo", NULL); + CheckTxnDBPinGet(txn_db, roptions, "foo", NULL); + { + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + const char* expected[3] = {"c", NULL, NULL}; + rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes, + vals, vals_sizes, errs); + CheckMultiGetValues(3, vals, vals_sizes, errs, expected); + } + + // commit + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + + // read from outside transaction, after commit + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + 
CheckTxnDBPinGet(txn_db, roptions, "foo", "hello"); + { + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + const char* expected[3] = {"c", "hello", NULL}; + rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes, + vals, vals_sizes, errs); + CheckMultiGetValues(3, vals, vals_sizes, errs, expected); + } + + // reuse old transaction + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn); + + // snapshot + const rocksdb_snapshot_t* snapshot; + snapshot = rocksdb_transactiondb_create_snapshot(txn_db); + rocksdb_readoptions_set_snapshot(roptions, snapshot); + + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); + CheckNoError(err); + + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + CheckTxnDBPinGet(txn_db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_transactiondb_release_snapshot(txn_db, snapshot); + CheckTxnDBGet(txn_db, roptions, "foo", "hey"); + CheckTxnDBPinGet(txn_db, roptions, "foo", "hey"); + + // iterate + rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err); + rocksdb_iterator_t* iter = + rocksdb_transaction_create_iterator(txn, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar", "hi"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + // rollback + rocksdb_transaction_rollback(txn, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "bar", NULL); + CheckTxnDBPinGet(txn_db, roptions, "bar", NULL); + + // save point + rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err); + rocksdb_transaction_set_savepoint(txn); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + CheckTxnPinGet(txn, roptions, "foo1", "hi1"); + rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err); + CheckTxnGet(txn, roptions, "foo2", "hi2"); + CheckTxnPinGet(txn, roptions, "foo2", "hi2"); + + // rollback to savepoint + rocksdb_transaction_rollback_to_savepoint(txn, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo2", NULL); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + CheckTxnPinGet(txn, roptions, "foo2", NULL); + CheckTxnPinGet(txn, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo1", NULL); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + CheckTxnDBPinGet(txn_db, roptions, "foo1", NULL); + CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + CheckTxnDBPinGet(txn_db, roptions, "foo1", "hi1"); + CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL); + + // Column families. 
+ rocksdb_column_family_handle_t* cfh; + cfh = rocksdb_transactiondb_create_column_family(txn_db, options, + "txn_db_cf", &err); + CheckNoError(err); + + rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello", + 8, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello"); + CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello"); + { + const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh}; + const char* keys[2] = {"cf_foo", "notfound"}; + const size_t keys_sizes[2] = {6, 8}; + char* vals[2]; + size_t vals_sizes[2]; + char* errs[2]; + const char* expected[2] = {"cf_hello", NULL}; + rocksdb_transactiondb_multi_get_cf(txn_db, roptions, get_handles, 2, keys, + keys_sizes, vals, vals_sizes, errs); + CheckMultiGetValues(2, vals, vals_sizes, errs, expected); + } + + rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL); + CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL); + + // flush + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_transactiondb_flush_wal(txn_db, 1, &err); + CheckNoError(err); + rocksdb_transactiondb_flush_cf(txn_db, flush_options, cfh, &err); + CheckNoError(err); + rocksdb_transactiondb_flush(txn_db, flush_options, &err); + CheckNoError(err); + rocksdb_flushoptions_destroy(flush_options); + + // close and destroy + rocksdb_column_family_handle_destroy(cfh); + rocksdb_transaction_destroy(txn); + rocksdb_transactiondb_close(txn_db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + rocksdb_transaction_options_destroy(txn_options); + rocksdb_transactiondb_options_destroy(txn_db_options); + } + + StartPhase("two-phase commit"); + { + // open a TransactionDB + txn_db_options = rocksdb_transactiondb_options_create(); + txn_options = rocksdb_transaction_options_create(); + rocksdb_options_set_create_if_missing(options, 1); + txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err); + CheckNoError(err); + + rocksdb_transaction_options_set_skip_prepare(txn_options, 0); + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + rocksdb_transaction_commit(txn, &err); + CheckCondition(err != NULL); + Free(&err); + err = NULL; + rocksdb_transaction_prepare(txn, &err); + CheckCondition(err != NULL); + Free(&err); + err = NULL; + rocksdb_transaction_set_name(txn, "txn1", 4, &err); + CheckNoError(err); + rocksdb_transaction_prepare(txn, &err); + CheckNoError(err); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + rocksdb_transaction_destroy(txn); + + // prepare 2 transactions and close db. 
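+ // Prepared transactions are durable: prepare persists them to the WAL, so
+ // after closing and reopening the DB they can be recovered with
+ // rocksdb_transactiondb_get_prepared_transactions() and then committed or
+ // rolled back by name, as done below.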
+
+    // prepare 2 transactions and close db.
+    rocksdb_transaction_t* txn1 =
+        rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_put(txn1, "bar1", 4, "1", 1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_set_name(txn1, "txn1", 4, &err);
+    CheckNoError(err);
+    rocksdb_transaction_prepare(txn1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_t* txn2 =
+        rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_put(txn2, "bar2", 4, "2", 1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_set_name(txn2, "txn2", 4, &err);
+    CheckNoError(err);
+    rocksdb_transaction_prepare(txn2, &err);
+    CheckNoError(err);
+    rocksdb_transaction_destroy(txn1);
+    rocksdb_transaction_destroy(txn2);
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+
+    // reopen db and get all prepared.
+    txn_db_options = rocksdb_transactiondb_options_create();
+    txn_options = rocksdb_transaction_options_create();
+    rocksdb_options_set_error_if_exists(options, 0);
+    txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+    CheckNoError(err);
+    CheckTxnDBPinGet(txn_db, roptions, "bar1", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "bar2", NULL);
+    size_t cnt;
+    rocksdb_transaction_t** txns =
+        rocksdb_transactiondb_get_prepared_transactions(txn_db, &cnt);
+    CheckCondition(cnt == 2);
+    size_t i;
+    for (i = 0; i < cnt; i++) {
+      txn = txns[i];
+      size_t name_len = 0;
+      char* name = rocksdb_transaction_get_name(txn, &name_len);
+      CheckCondition(name_len == 4);
+      if (strncmp(name, "txn1", name_len) == 0) {
+        rocksdb_transaction_commit(txn, &err);
+      } else if (strncmp(name, "txn2", name_len) == 0) {
+        rocksdb_transaction_rollback(txn, &err);
+      }
+      rocksdb_free(name);
+      CheckNoError(err);
+      rocksdb_transaction_destroy(txn);
+    }
+    rocksdb_free(txns);
+    CheckTxnDBGet(txn_db, roptions, "bar1", "1");
+    CheckTxnDBGet(txn_db, roptions, "bar2", NULL);
+    rocksdb_transactiondb_put(txn_db, woptions, "bar2", 4, "2", 1, &err);
+    CheckNoError(err);
+
+    // close and destroy
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+  }
+
+  StartPhase("optimistic_transactions");
+  {
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+    otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+    otxn_options = rocksdb_optimistictransaction_options_create();
+    rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+    CheckNoError(err);
+    rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+    CheckNoError(err);
+    CheckTxnGet(txn1, roptions, "key", "value");
+    CheckTxnPinGet(txn1, roptions, "key", "value");
+    rocksdb_transaction_commit(txn1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_commit(txn2, &err);
+    CheckNoError(err);
+    rocksdb_transaction_destroy(txn1);
+    rocksdb_transaction_destroy(txn2);
+
+    // Check column family
+    db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+    rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+    CheckNoError(err);
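// Illustrative sketch of optimistic conflict detection (standalone example,
// not exercised in this test): the two transactions above touch disjoint
// keys, so both commits succeed. Had they written the same key, the second
// commit would be expected to fail validation with a "Busy"-style error
// rather than block, since optimistic transactions detect conflicts at
// commit time.
char* e = NULL;
rocksdb_transaction_t* a = rocksdb_optimistictransaction_begin(
    otxn_db, woptions, otxn_options, NULL);
rocksdb_transaction_t* b = rocksdb_optimistictransaction_begin(
    otxn_db, woptions, otxn_options, NULL);
rocksdb_transaction_put(a, "k", 1, "va", 2, &e);
rocksdb_transaction_put(b, "k", 1, "vb", 2, &e);
rocksdb_transaction_commit(a, &e);  // first writer wins
rocksdb_transaction_commit(b, &e);  // expected to fail: e != NULL
rocksdb_transaction_destroy(a);
rocksdb_transaction_destroy(b);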
rocksdb_column_family_handle_t *cfh1, *cfh2;
+    cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+    cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+    txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+                                              NULL);
+    rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+    CheckNoError(err);
+    rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+    CheckNoError(err);
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+    txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+                                              txn);
+    CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+    CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+    CheckTxnPinGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+    {
+      const rocksdb_column_family_handle_t* get_handles[3] = {cfh1, cfh2, cfh2};
+      const char* keys[3] = {"key_cf1", "key_cf2", "notfound"};
+      const size_t keys_sizes[3] = {7, 7, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"val_cf1", "val_cf2", NULL};
+      rocksdb_transaction_multi_get_cf(txn, roptions, get_handles, 3, keys,
+                                       keys_sizes, vals, vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+
+    // Check iterator with column family
+    rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+    CheckNoError(err);
+    rocksdb_iterator_t* iter =
+        rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "key1_cf", "val1_cf");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    rocksdb_transaction_destroy(txn);
+    rocksdb_column_family_handle_destroy(cfh1);
+    rocksdb_column_family_handle_destroy(cfh2);
+    rocksdb_optimistictransactiondb_close_base_db(db);
+    rocksdb_optimistictransactiondb_close(otxn_db);
+
+    // Check open optimistic transaction db with column families
+    size_t cf_len;
+    char** column_fams =
+        rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+    CheckNoError(err);
+    CheckEqual("default", column_fams[0], 7);
+    CheckEqual("txn_db_cf1", column_fams[1], 10);
+    CheckEqual("txn_db_cf2", column_fams[2], 10);
+    CheckCondition(cf_len == 3);
+    rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+    const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+    rocksdb_options_t* cf_options = rocksdb_options_create();
+    const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+    rocksdb_options_set_error_if_exists(cf_options, 0);
+    rocksdb_column_family_handle_t* cf_handles[3];
+    otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+        db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+    CheckNoError(err);
+    rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+    rocksdb_transaction_destroy(txn_cf);
+    rocksdb_options_destroy(cf_options);
+    rocksdb_column_family_handle_destroy(cf_handles[0]);
+    rocksdb_column_family_handle_destroy(cf_handles[1]);
+    rocksdb_column_family_handle_destroy(cf_handles[2]);
+    rocksdb_optimistictransactiondb_close(otxn_db);
+    rocksdb_destroy_db(db_options, dbname, &err);
+    rocksdb_options_destroy(db_options);
+    rocksdb_optimistictransaction_options_destroy(otxn_options);
+    CheckNoError(err);
+  }
+
+  // Simple sanity check that setting memtable rep works.
+  StartPhase("memtable_reps");
+  {
+    // Create database with vector memtable.
+    rocksdb_options_set_memtable_vector_rep(options);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    // Create database with hash skiplist memtable.
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  // Check that secondary instance works.
+  StartPhase("open_as_secondary");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    db = rocksdb_open(db_options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_t* db1;
+    rocksdb_options_t* opts = rocksdb_options_create();
+    rocksdb_options_set_max_open_files(opts, -1);
+    rocksdb_options_set_create_if_missing(opts, 1);
+    snprintf(secondary_path, sizeof(secondary_path),
+             "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+    db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+    CheckNoError(err);
+
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    rocksdb_writeoptions_disable_WAL(woptions, 1);
+    rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+    CheckNoError(err);
+    rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_opts, 1);
+    rocksdb_flush(db, flush_opts, &err);
+    CheckNoError(err);
+    rocksdb_try_catch_up_with_primary(db1, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_verify_checksums(ropts, 1);
+    rocksdb_readoptions_set_snapshot(ropts, NULL);
+    CheckGet(db, ropts, "key0", "value0");
+    CheckGet(db1, ropts, "key0", "value0");
+
+    rocksdb_writeoptions_disable_WAL(woptions, 0);
+    rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+    CheckNoError(err);
+    rocksdb_try_catch_up_with_primary(db1, &err);
+    CheckNoError(err);
+    CheckGet(db1, ropts, "key0", "value0");
+    CheckGet(db1, ropts, "key1", "value1");
+
+    rocksdb_close(db1);
+    rocksdb_destroy_db(opts, secondary_path, &err);
+    CheckNoError(err);
+
+    rocksdb_options_destroy(db_options);
+    rocksdb_options_destroy(opts);
+    rocksdb_readoptions_destroy(ropts);
+    rocksdb_flushoptions_destroy(flush_opts);
+  }
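// Illustrative sketch (standalone, not part of the patch; assumes db1, the
// secondary handle opened above): a secondary instance is read-only and
// observes primary writes only after an explicit catch-up, so a polling
// reader refreshes before serving reads.
char* e = NULL;
rocksdb_try_catch_up_with_primary(db1, &e);  // replay the primary's new state
if (e != NULL) {
  fprintf(stderr, "catch-up failed: %s\n", e);
  rocksdb_free(e);
}
// ... serve reads from db1 at the refreshed state ...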
+ StartPhase("open_db_paths"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + const rocksdb_dbpath_t* paths[1] = {dbpath}; + rocksdb_options_set_db_paths(options, paths, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + } + + StartPhase("filter_with_prefix_seek"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor( + options, rocksdb_slicetransform_create_fixed_prefix(1)); + rocksdb_filterpolicy_t* filter_policy = + rocksdb_filterpolicy_create_bloom_full(8.0); + rocksdb_block_based_options_set_filter_policy(table_options, filter_policy); + rocksdb_options_set_block_based_table_factory(options, table_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + int i; + for (i = 0; i < 10; ++i) { + char key = '0' + (char)i; + rocksdb_put(db, woptions, &key, 1, "", 1, &err); + CheckNoError(err); + } + + // Flush to generate an L0 so that filter will be used later. + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_flush(db, flush_options, &err); + rocksdb_flushoptions_destroy(flush_options); + CheckNoError(err); + + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts); + + rocksdb_iter_seek(iter, "0", 1); + int cnt = 0; + while (rocksdb_iter_valid(iter)) { + ++cnt; + rocksdb_iter_next(iter); + } + CheckCondition(10 == cnt); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_destroy(ropts); + } + + StartPhase("cancel_all_background_work"); + rocksdb_cancel_all_background_work(db, 1); + + StartPhase("cleanup"); + rocksdb_close(db); + rocksdb_options_destroy(options); + rocksdb_block_based_options_destroy(table_options); + rocksdb_readoptions_destroy(roptions); + rocksdb_writeoptions_destroy(woptions); + rocksdb_compactoptions_destroy(coptions); + rocksdb_cache_destroy(cache); + rocksdb_comparator_destroy(cmp); + rocksdb_dbpath_destroy(dbpath); + rocksdb_env_destroy(env); + + fprintf(stderr, "PASS\n"); + return 0; +} + +#else + +int main(void) { + fprintf(stderr, "SKIPPED\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc new file mode 100644 index 000000000..268060ddf --- /dev/null +++ b/src/rocksdb/db/column_family.cc @@ -0,0 +1,1683 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/column_family.h" + +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_source.h" +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/range_del_aggregator.h" +#include "db/table_properties_collector.h" +#include "db/version_set.h" +#include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "monitoring/thread_status_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" +#include "table/merging_iterator.h" +#include "util/autovector.h" +#include "util/cast_util.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( + ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex) + : cfd_(column_family_data), db_(db), mutex_(mutex) { + if (cfd_ != nullptr) { + cfd_->Ref(); + } +} + +ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { + if (cfd_ != nullptr) { +#ifndef ROCKSDB_LITE + for (auto& listener : cfd_->ioptions()->listeners) { + listener->OnColumnFamilyHandleDeletionStarted(this); + } +#endif // ROCKSDB_LITE + // Job id == 0 means that this is not our background process, but rather + // user thread + // Need to hold some shared pointers owned by the initial_cf_options + // before final cleaning up finishes. + ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options(); + JobContext job_context(0); + mutex_->Lock(); + bool dropped = cfd_->IsDropped(); + if (cfd_->UnrefAndTryDelete()) { + if (dropped) { + db_->FindObsoleteFiles(&job_context, false, true); + } + } + mutex_->Unlock(); + if (job_context.HaveSomethingToDelete()) { + bool defer_purge = + db_->immutable_db_options().avoid_unnecessary_blocking_io; + db_->PurgeObsoleteFiles(job_context, defer_purge); + } + job_context.Clean(); + } +} + +uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } + +const std::string& ColumnFamilyHandleImpl::GetName() const { + return cfd()->GetName(); +} + +Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) { +#ifndef ROCKSDB_LITE + // accessing mutable cf-options requires db mutex. 
InstrumentedMutexLock l(mutex_);
+  *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+  return Status::OK();
+#else
+  (void)desc;
+  return Status::NotSupported();
+#endif  // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+  return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
+  assert(int_tbl_prop_collector_factories);
+
+  auto& collector_factories = ioptions.table_properties_collector_factories;
+  for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+       ++i) {
+    assert(collector_factories[i]);
+    int_tbl_prop_collector_factories->emplace_back(
+        new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+  }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+  if (!cf_options.compression_per_level.empty()) {
+    for (size_t level = 0; level < cf_options.compression_per_level.size();
+         ++level) {
+      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+        return Status::InvalidArgument(
+            "Compression type " +
+            CompressionTypeToString(cf_options.compression_per_level[level]) +
+            " is not linked with the binary.");
+      }
+    }
+  } else {
+    if (!CompressionTypeSupported(cf_options.compression)) {
+      return Status::InvalidArgument(
+          "Compression type " +
+          CompressionTypeToString(cf_options.compression) +
+          " is not linked with the binary.");
+    }
+  }
+  if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+    if (cf_options.compression_opts.use_zstd_dict_trainer) {
+      if (!ZSTD_TrainDictionarySupported()) {
+        return Status::InvalidArgument(
+            "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+            "is not linked with the binary.");
+      }
+    } else if (!ZSTD_FinalizeDictionarySupported()) {
+      return Status::InvalidArgument(
+          "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
+          "is not linked with the binary.");
+    }
+    if (cf_options.compression_opts.max_dict_bytes == 0) {
+      return Status::InvalidArgument(
+          "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+          "should be nonzero if we're using zstd's dictionary generator.");
+    }
+  }
+
+  if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
+    std::ostringstream oss;
+    oss << "The specified blob compression type "
+        << CompressionTypeToString(cf_options.blob_compression_type)
+        << " is not available.";
+
+    return Status::InvalidArgument(oss.str());
+  }
+
+  return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+  if (cf_options.inplace_update_support) {
+    return Status::InvalidArgument(
+        "In-place memtable updates (inplace_update_support) is not compatible "
+        "with concurrent writes (allow_concurrent_memtable_write)");
+  }
+  if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+    return Status::InvalidArgument(
+        "Memtable doesn't support concurrent writes "
+        "(allow_concurrent_memtable_write)");
+  }
+  return Status::OK();
+}
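// Illustrative sketch (not part of the patch): a configuration that
// CheckCompressionSupported() above rejects even when zstd is linked,
// because dictionary training is enabled without a dictionary size limit.
ColumnFamilyOptions cf;
cf.compression = kZSTD;
cf.compression_opts.zstd_max_train_bytes = 1 << 20;  // train on up to 1 MiB
cf.compression_opts.max_dict_bytes = 0;              // invalid: limit unset
// CheckCompressionSupported(cf) -> Status::InvalidArgument(...)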
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+                             const ColumnFamilyOptions& cf_options) {
+  // More than one cf_path is supported only in the universal and level
+  // compaction styles. This function also checks the case in which cf_paths
+  // is not specified, which results in db_paths being used.
+  if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+      (cf_options.compaction_style != kCompactionStyleLevel)) {
+    if (cf_options.cf_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one CF paths are only supported in "
+          "universal and level compaction styles. ");
+    } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one DB paths are only supported in "
+          "universal and level compaction styles. ");
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+}  // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  size_t clamp_max = std::conditional<
+      sizeof(size_t) == 4, std::integral_constant<uint32_t, 0xffffffff>,
+      std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+  ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
+              clamp_max);
+  // if user sets arena_block_size, we trust user to use this value. Otherwise,
+  // calculate a proper value from write_buffer_size;
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size =
+        std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
+
+    // Align up to 4k
+    const size_t align = 4 * 1024;
+    result.arena_block_size =
+        ((result.arena_block_size + align - 1) / align) * align;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.min_write_buffer_number_to_merge < 1) {
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
+    ROCKS_LOG_WARN(
+        db_options.logger,
+        "Currently, if atomic_flush is true, then triggering flush for any "
+        "column family internally (non-manual flush) will trigger flushing "
+        "all column families even if the number of memtables is smaller "
+        "than min_write_buffer_number_to_merge. Therefore, configuring "
+        "min_write_buffer_number_to_merge > 1 is not compatible and should "
+        "be sanitized to 1. Not doing so will lead to data loss and "
+        "inconsistent state across multiple column families when WAL is "
+        "disabled, which is a common setting for atomic flush");
+
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (result.num_levels < 1) {
+    result.num_levels = 1;
+  }
+  if (result.compaction_style == kCompactionStyleLevel &&
+      result.num_levels < 2) {
+    result.num_levels = 2;
+  }
+
+  if (result.compaction_style == kCompactionStyleUniversal &&
+      db_options.allow_ingest_behind && result.num_levels < 3) {
+    result.num_levels = 3;
+  }
+
+  if (result.max_write_buffer_number < 2) {
+    result.max_write_buffer_number = 2;
+  }
+  // fall back to max_write_buffer_number_to_maintain if
+  // max_write_buffer_size_to_maintain is not set
+  if (result.max_write_buffer_size_to_maintain < 0) {
+    result.max_write_buffer_size_to_maintain =
+        result.max_write_buffer_number *
+        static_cast<int64_t>(result.write_buffer_size);
+  } else if (result.max_write_buffer_size_to_maintain == 0 &&
+             result.max_write_buffer_number_to_maintain < 0) {
+    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+  }
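// Worked example of the arena_block_size default above (illustrative): with
// write_buffer_size = 6 MiB, the pre-alignment block size is
// min(1 MiB, 6 MiB / 8) = 768 KiB, already 4 KiB-aligned. A 100 KiB write
// buffer would instead give 12800 bytes, rounded up to 16384 by the align-up.
size_t wbs = 6 * 1024 * 1024;
size_t block = std::min(size_t{1024 * 1024}, wbs / 8);  // 786432
const size_t align = 4 * 1024;
block = ((block + align - 1) / align) * align;          // still 786432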
+  // bloom filter size shouldn't exceed 1/4 of memtable size.
+  if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+    result.memtable_prefix_bloom_size_ratio = 0.25;
+  } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+    result.memtable_prefix_bloom_size_ratio = 0;
+  }
+
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  if (result.max_bytes_for_level_multiplier <= 0) {
+    result.max_bytes_for_level_multiplier = 1;
+  }
+
+  if (result.level0_file_num_compaction_trigger == 0) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "level0_file_num_compaction_trigger cannot be 0");
+    result.level0_file_num_compaction_trigger = 1;
+  }
+
+  if (result.level0_stop_writes_trigger <
+          result.level0_slowdown_writes_trigger ||
+      result.level0_slowdown_writes_trigger <
+          result.level0_file_num_compaction_trigger) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "This condition must be satisfied: "
+                   "level0_stop_writes_trigger(%d) >= "
+                   "level0_slowdown_writes_trigger(%d) >= "
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+    if (result.level0_slowdown_writes_trigger <
+        result.level0_file_num_compaction_trigger) {
+      result.level0_slowdown_writes_trigger =
+          result.level0_file_num_compaction_trigger;
+    }
+    if (result.level0_stop_writes_trigger <
+        result.level0_slowdown_writes_trigger) {
+      result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+    }
+    ROCKS_LOG_WARN(db_options.logger,
+                   "Adjust the value to "
+                   "level0_stop_writes_trigger(%d) "
+                   "level0_slowdown_writes_trigger(%d) "
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+  }
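// Worked example of the trigger re-ordering above (illustrative): given
// level0_file_num_compaction_trigger = 8, level0_slowdown_writes_trigger = 4
// and level0_stop_writes_trigger = 6, sanitization first raises the slowdown
// trigger to 8 (the compaction trigger), then raises the stop trigger to 8,
// restoring stop >= slowdown >= compaction-trigger.
int trigger = 8, slowdown = 4, stop = 6;
if (slowdown < trigger) slowdown = trigger;  // 4 -> 8
if (stop < slowdown) stop = slowdown;        // 6 -> 8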
+
+  if (result.soft_pending_compaction_bytes_limit == 0) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+             result.soft_pending_compaction_bytes_limit >
+                 result.hard_pending_compaction_bytes_limit) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  }
+
+#ifndef ROCKSDB_LITE
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet; when we open the DB we will find these .trash
+  // files and schedule them to be deleted (or delete immediately if
+  // SstFileManager was not used)
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+  for (size_t i = 0; i < result.cf_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(db_options.env, sfm,
+                                      result.cf_paths[i].path)
+        .PermitUncheckedError();
+  }
+#endif
+
+  if (result.cf_paths.empty()) {
+    result.cf_paths = db_options.db_paths;
+  }
+
+  if (result.level_compaction_dynamic_level_bytes) {
+    if (result.compaction_style != kCompactionStyleLevel) {
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "level_compaction_dynamic_level_bytes only makes sense "
+                     "for level-based compaction");
+      result.level_compaction_dynamic_level_bytes = false;
+    } else if (result.cf_paths.size() > 1U) {
+      // we don't yet know how to make both of this feature and multiple
+      // DB paths work.
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "multiple cf_paths/db_paths and "
+                     "level_compaction_dynamic_level_bytes "
+                     "can't be used together");
+      result.level_compaction_dynamic_level_bytes = false;
+    }
+  }
+
+  if (result.max_compaction_bytes == 0) {
+    result.max_compaction_bytes = result.target_file_size_base * 25;
+  }
+
+  bool is_block_based_table = (result.table_factory->IsInstanceOf(
+      TableFactory::kBlockBasedTableName()));
+
+  const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+  if (result.ttl == kDefaultTtl) {
+    if (is_block_based_table &&
+        result.compaction_style != kCompactionStyleFIFO) {
+      result.ttl = kAdjustedTtl;
+    } else {
+      result.ttl = 0;
+    }
+  }
+
+  const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+  // Turn on periodic compactions and set them to occur once every 30 days if
+  // compaction filters are used and periodic_compaction_seconds is set to the
+  // default value.
+  if (result.compaction_style != kCompactionStyleFIFO) {
+    if ((result.compaction_filter != nullptr ||
+         result.compaction_filter_factory != nullptr) &&
+        result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+        is_block_based_table) {
+      result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+    }
+  } else {
+    // result.compaction_style == kCompactionStyleFIFO
+    if (result.ttl == 0) {
+      if (is_block_based_table) {
+        if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+          result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+        }
+        result.ttl = result.periodic_compaction_seconds;
+      }
+    } else if (result.periodic_compaction_seconds != 0) {
+      result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+    }
+  }
+
+  // TTL compactions would work similar to Periodic Compactions in Universal in
+  // most of the cases. So, if ttl is set, execute the periodic compaction
+  // codepath.
+  if (result.compaction_style == kCompactionStyleUniversal &&
+      result.ttl != 0) {
+    if (result.periodic_compaction_seconds != 0) {
+      result.periodic_compaction_seconds =
+          std::min(result.ttl, result.periodic_compaction_seconds);
+    } else {
+      result.periodic_compaction_seconds = result.ttl;
+    }
+  }
+
+  if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+    result.periodic_compaction_seconds = 0;
+  }
+
+  return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+  for (auto td : to_delete) {
+    delete td;
+  }
+}
+
+SuperVersion* SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+bool SuperVersion::Unref() {
+  // fetch_sub returns the previous value of ref
+  uint32_t previous_refs = refs.fetch_sub(1);
+  assert(previous_refs > 0);
+  return previous_refs == 1;
+}
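// Minimal sketch of the SuperVersion reference-counting contract above
// (illustrative, simplified): whichever caller drops the last reference is
// responsible for running Cleanup() and then deleting the object.
SuperVersion* sv = cfd->GetSuperVersion()->Ref();  // assumes a live cfd
// ... read through sv->mem / sv->imm / sv->current ...
if (sv->Unref()) {   // true only for the final reference
  sv->Cleanup();     // drops refs on mem/imm/current, collects to_delete
  delete sv;
}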
+
+void SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  // Since this SuperVersion object is being deleted,
+  // decrement reference to the immutable MemtableList
+  // this SV object was pointing to.
+  imm->Unref(&to_delete);
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+    assert(*memory_usage >= m->ApproximateMemoryUsage());
+    *memory_usage -= m->ApproximateMemoryUsage();
+    to_delete.push_back(m);
+  }
+  current->Unref();
+  cfd->UnrefAndTryDelete();
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+                        MemTableListVersion* new_imm, Version* new_current) {
+  cfd = new_cfd;
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  cfd->Ref();
+  mem->Ref();
+  imm->Ref();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+  // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+  // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+  // When the latter happens, only super_version_ holds a reference
+  // to ColumnFamilyData, so no further queries are possible.
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  bool was_last_ref __attribute__((__unused__));
+  was_last_ref = sv->Unref();
+  // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+  // This is important because we can't do SuperVersion cleanup here.
+  // That would require locking DB mutex, which would deadlock because
+  // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+  assert(!was_last_ref);
+}
+}  // anonymous namespace
+
+std::vector<std::string> ColumnFamilyData::GetDbPaths() const {
+  std::vector<std::string> paths;
+  paths.reserve(ioptions_.cf_paths.size());
+  for (const DbPath& db_path : ioptions_.cf_paths) {
+    paths.emplace_back(db_path.path);
+  }
+  return paths;
+}
+
+const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId =
+    std::numeric_limits<uint32_t>::max();
+
+ColumnFamilyData::ColumnFamilyData(
+    uint32_t id, const std::string& name, Version* _dummy_versions,
+    Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+    const ColumnFamilyOptions& cf_options,
+    const ImmutableDBOptions& db_options, const FileOptions* file_options,
+    ColumnFamilySet* column_family_set,
+    BlockCacheTracer* const block_cache_tracer,
+    const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+    const std::string& db_session_id)
+    : id_(id),
+      name_(name),
+      dummy_versions_(_dummy_versions),
+      current_(nullptr),
+      refs_(0),
+      initialized_(false),
+      dropped_(false),
+      internal_comparator_(cf_options.comparator),
+      initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+      ioptions_(db_options, initial_cf_options_),
+      mutable_cf_options_(initial_cf_options_),
+      is_delete_range_supported_(
+          cf_options.table_factory->IsDeleteRangeSupported()),
+      write_buffer_manager_(write_buffer_manager),
+      mem_(nullptr),
+      imm_(ioptions_.min_write_buffer_number_to_merge,
+           ioptions_.max_write_buffer_number_to_maintain,
+           ioptions_.max_write_buffer_size_to_maintain),
+      super_version_(nullptr),
+      super_version_number_(0),
+      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+      next_(nullptr),
+      prev_(nullptr),
+      log_number_(0),
+      flush_reason_(FlushReason::kOthers),
+      column_family_set_(column_family_set),
+      queued_for_flush_(false),
+      queued_for_compaction_(false),
+      prev_compaction_needed_bytes_(0),
+      allow_2pc_(db_options.allow_2pc),
+      last_memtable_id_(0),
+      db_paths_registered_(false),
+      mempurge_used_(false) {
+  if (id_ != kDummyColumnFamilyDataId) {
+    // TODO(cc): RegisterDbPaths can be expensive, considering moving it
+    // outside of this constructor which might be called with db mutex held.
+    // TODO(cc): considering using ioptions_.fs, currently some tests rely on
+    // EnvWrapper, that's the main reason why we use env here.
+    Status s = ioptions_.env->RegisterDbPaths(GetDbPaths());
+    if (s.ok()) {
+      db_paths_registered_ = true;
+    } else {
+      ROCKS_LOG_ERROR(
+          ioptions_.logger,
+          "Failed to register data paths of column family (id: %d, name: %s)",
+          id_, name_.c_str());
+    }
+  }
+  Ref();
+
+  // Convert user defined table properties collector factories to internal ones.
+  GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+  // if _dummy_versions is nullptr, then this is a dummy column family.
+  if (_dummy_versions != nullptr) {
+    internal_stats_.reset(
+        new InternalStats(ioptions_.num_levels, ioptions_.clock, this));
+    table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+                                      block_cache_tracer, io_tracer,
+                                      db_session_id));
+    blob_file_cache_.reset(
+        new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
+                          internal_stats_->GetBlobFileReadHist(), io_tracer));
+    blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
+                                      blob_file_cache_.get()));
+
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+      compaction_picker_.reset(
+          new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+      compaction_picker_.reset(
+          new NullCompactionPicker(ioptions_, &internal_comparator_));
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "Column family %s does not use any background compaction. "
+                     "Compactions can only be done via CompactFiles\n",
+                     GetName().c_str());
+#endif  // !ROCKSDB_LITE
+    } else {
+      ROCKS_LOG_ERROR(ioptions_.logger,
+                      "Unable to recognize the specified compaction style %d. "
" + "Column family %s will use kCompactionStyleLevel.\n", + ioptions_.compaction_style, GetName().c_str()); + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); + } + + if (column_family_set_->NumberOfColumnFamilies() < 10) { + ROCKS_LOG_INFO(ioptions_.logger, + "--------------- Options for column family [%s]:\n", + name.c_str()); + initial_cf_options_.Dump(ioptions_.logger); + } else { + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); + } + } + + RecalculateWriteStallConditions(mutable_cf_options_); + + if (cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) && + cf_options.table_factory->GetOptions()) { + const BlockBasedTableOptions* bbto = + cf_options.table_factory->GetOptions(); + const auto& options_overrides = bbto->cache_usage_options.options_overrides; + const auto file_metadata_charged = + options_overrides.at(CacheEntryRole::kFileMetadata).charged; + if (bbto->block_cache && + file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) { + // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope + // responsible for reservation of `ObsoleteFileInfo` so that we can keep + // this `file_metadata_cache_res_mgr_` nonconcurrent + file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( + std::make_shared< + CacheReservationManagerImpl>( + bbto->block_cache))); + } + } +} + +// DB mutex held +ColumnFamilyData::~ColumnFamilyData() { + assert(refs_.load(std::memory_order_relaxed) == 0); + // remove from linked list + auto prev = prev_; + auto next = next_; + prev->next_ = next; + next->prev_ = prev; + + if (!dropped_ && column_family_set_ != nullptr) { + // If it's dropped, it's already removed from column family set + // If column_family_set_ == nullptr, this is dummy CFD and not in + // ColumnFamilySet + column_family_set_->RemoveColumnFamily(this); + } + + if (current_ != nullptr) { + current_->Unref(); + } + + // It would be wrong if this ColumnFamilyData is in flush_queue_ or + // compaction_queue_ and we destroyed it + assert(!queued_for_flush_); + assert(!queued_for_compaction_); + assert(super_version_ == nullptr); + + if (dummy_versions_ != nullptr) { + // List must be empty + assert(dummy_versions_->Next() == dummy_versions_); + bool deleted __attribute__((__unused__)); + deleted = dummy_versions_->Unref(); + assert(deleted); + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + autovector to_delete; + imm_.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } + + if (db_paths_registered_) { + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. + Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); + if (!s.ok()) { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to unregister data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } +} + +bool ColumnFamilyData::UnrefAndTryDelete() { + int old_refs = refs_.fetch_sub(1); + assert(old_refs > 0); + + if (old_refs == 1) { + assert(super_version_ == nullptr); + delete this; + return true; + } + + if (old_refs == 2 && super_version_ != nullptr) { + // Only the super_version_ holds me + SuperVersion* sv = super_version_; + super_version_ = nullptr; + + // Release SuperVersion references kept in ThreadLocalPtr. 
+    local_sv_.reset();
+
+    if (sv->Unref()) {
+      // Note: sv will delete this ColumnFamilyData during Cleanup()
+      assert(sv->cfd == this);
+      sv->Cleanup();
+      delete sv;
+      return true;
+    }
+  }
+  return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+  // can't drop default CF
+  assert(id_ != 0);
+  dropped_ = true;
+  write_controller_token_.reset();
+
+  // remove from column_family_set
+  column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+  return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+  auto current_log = GetLogNumber();
+
+  if (allow_2pc_) {
+    auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
+    auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+    if (imm_prep_log > 0 && imm_prep_log < current_log) {
+      current_log = imm_prep_log;
+    }
+
+    if (mem_prep_log > 0 && mem_prep_log < current_log) {
+      current_log = mem_prep_log;
+    }
+  }
+
+  return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
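// Worked example of the slowdown ratios above (illustrative): starting from
// a delayed write rate of 16 MB/s, one near-stop penalty multiplies by 0.6
// (to 9.6 MB/s); each further step with non-shrinking compaction debt
// multiplies by 0.8 (to ~7.7 MB/s, ~6.1 MB/s, ...), never dropping below the
// 16 KB/s floor. Paying off debt multiplies by 1/0.8 = 1.25, capped at the
// user-configured max_delayed_write_rate.
uint64_t rate = 16 * 1024 * 1024;
rate = static_cast<uint64_t>(rate * 0.6);  // near-stop penalty: 9.6 MB/s
rate = static_cast<uint64_t>(rate * 0.8);  // debt not shrinking: ~7.7 MB/s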
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+    WriteController* write_controller, uint64_t compaction_needed_bytes,
+    uint64_t prev_compaction_need_bytes, bool penalize_stop,
+    bool auto_compactions_disabled) {
+  const uint64_t kMinWriteRate = 16 * 1024u;  // Minimum write rate 16KB/s.
+
+  uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+  uint64_t write_rate = write_controller->delayed_write_rate();
+
+  if (auto_compactions_disabled) {
+    // When auto compaction is disabled, always use the value user gave.
+    write_rate = max_write_rate;
+  } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+    // If user gives rate less than kMinWriteRate, don't adjust it.
+    //
+    // If already delayed, need to adjust based on previous compaction debt.
+    // When two or more column families require delay, we always increase or
+    // reduce the write rate based on information for one single column
+    // family. It is likely to be OK but we can improve if there is a
+    // problem.
+    // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes
+    // is only available in level-based compaction
+    //
+    // If the compaction debt stays the same as previously, we also further
+    // slow down. It usually means a mem table is full. It's mainly for the
+    // case where both flush and compaction are much slower than the speed we
+    // insert to mem tables, so we need to actively slow down before we get
+    // feedback signal from compaction and flushes to avoid the full stop
+    // because of hitting the max write buffer number.
+    //
+    // If the DB just fell into the stop condition, we need to further reduce
+    // the write rate to avoid the stop condition.
+    if (penalize_stop) {
+      // Penalize the near stop or stop condition by a more aggressive
+      // slowdown. This is to provide the long term slowdown increase signal.
+      // The penalty is more than the reward of recovering to the normal
+      // condition.
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kNearStopSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > 0 &&
+               prev_compaction_need_bytes <= compaction_needed_bytes) {
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kIncSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+      // We are speeding up by ratio of kSlowdownRatio when we have paid
+      // compaction debt. But we'll never speed up to faster than the write
+      // rate given by users.
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kDecSlowdownRatio);
+      if (write_rate > max_write_rate) {
+        write_rate = max_write_rate;
+      }
+    }
+  }
+  return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+                                    int level0_slowdown_writes_trigger) {
+  // SanitizeOptions() ensures it.
+  assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+  if (level0_file_num_compaction_trigger < 0) {
+    return std::numeric_limits<int>::max();
+  }
+
+  const int64_t twice_level0_trigger =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+  const int64_t one_fourth_trigger_slowdown =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) +
+      ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+       4);
+
+  assert(twice_level0_trigger >= 0);
+  assert(one_fourth_trigger_slowdown >= 0);
+
+  // 1/4 of the way between L0 compaction trigger threshold and slowdown
+  // condition.
+  // Or twice as compaction trigger, if it is smaller.
+  int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+  if (res >= std::numeric_limits<int>::max()) {
+    return std::numeric_limits<int>::max();
+  } else {
+    // res fits in int
+    return static_cast<int>(res);
+  }
+}
+}  // anonymous namespace
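// Worked example of GetL0ThresholdSpeedupCompaction (illustrative): with
// level0_file_num_compaction_trigger = 4 and level0_slowdown_writes_trigger
// = 20, the candidates are 2 * 4 = 8 and 4 + (20 - 4) / 4 = 8, so extra
// compaction pressure kicks in at 8 L0 files, well before the slowdown
// threshold.
int threshold = std::min(2 * 4, 4 + (20 - 4) / 4);  // == 8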
+
+std::pair<WriteStallCondition, WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+    int num_unflushed_memtables, int num_l0_files,
+    uint64_t num_compaction_needed_bytes,
+    const MutableCFOptions& mutable_cf_options,
+    const ImmutableCFOptions& immutable_cf_options) {
+  if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.hard_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kStopped,
+            WriteStallCause::kPendingCompactionBytes};
+  } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+             num_unflushed_memtables >=
+                 mutable_cf_options.max_write_buffer_number - 1 &&
+             num_unflushed_memtables - 1 >=
+                 immutable_cf_options.min_write_buffer_number_to_merge) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+             num_l0_files >=
+                 mutable_cf_options.level0_slowdown_writes_trigger) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kDelayed,
+            WriteStallCause::kPendingCompactionBytes};
+  }
+  return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+    const MutableCFOptions& mutable_cf_options) {
+  auto write_stall_condition = WriteStallCondition::kNormal;
+  if (current_ != nullptr) {
+    auto* vstorage = current_->storage_info();
+    auto write_controller = column_family_set_->write_controller_;
+    uint64_t compaction_needed_bytes =
+        vstorage->estimated_compaction_needed_bytes();
+
+    auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+        imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+        vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+        *ioptions());
+    write_stall_condition = write_stall_condition_and_cause.first;
+    auto write_stall_cause = write_stall_condition_and_cause.second;
+
+    bool was_stopped = write_controller->IsStopped();
+    bool needed_delay = write_controller->NeedsDelay();
+
+    if (write_stall_condition == WriteStallCondition::kStopped &&
+        write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d",
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number);
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stopping writes because we have %d level-0 files",
+                     name_.c_str(), vstorage->l0_delay_trigger_count());
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because of estimated pending compaction "
+          "bytes %" PRIu64,
+          name_.c_str(), compaction_needed_bytes);
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d "
+          "rate %" PRIu64,
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number,
+          write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      // The L0 file count is within two files of the stop trigger.
+      bool near_stop = vstorage->l0_delay_trigger_count() >=
+                       mutable_cf_options.level0_stop_writes_trigger - 2;
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+                                  1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stalling writes because we have %d level-0 files "
+                     "rate %" PRIu64,
+                     name_.c_str(), vstorage->l0_delay_trigger_count(),
+                     write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      // If the distance to the hard limit is less than 1/4 of the gap between
+      // the soft and hard bytes limits, we consider it near stop and speed up
+      // the slowdown.
+      bool near_stop =
+          mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+          (compaction_needed_bytes -
+           mutable_cf_options.soft_pending_compaction_bytes_limit) >
+              3 *
+                  (mutable_cf_options.hard_pending_compaction_bytes_limit -
+                   mutable_cf_options.soft_pending_compaction_bytes_limit) /
+                  4;
+
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because of estimated pending compaction "
+          "bytes %" PRIu64 " rate %" PRIu64,
+          name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+          write_controller->delayed_write_rate());
+    } else {
+      assert(write_stall_condition == WriteStallCondition::kNormal);
+      if (vstorage->l0_delay_trigger_count() >=
+          GetL0ThresholdSpeedupCompaction(
+              mutable_cf_options.level0_file_num_compaction_trigger,
+              mutable_cf_options.level0_slowdown_writes_trigger)) {
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        ROCKS_LOG_INFO(
+            ioptions_.logger,
+            "[%s] Increasing compaction threads because we have %d level-0 "
+            "files ",
+            name_.c_str(), vstorage->l0_delay_trigger_count());
+      } else if (vstorage->estimated_compaction_needed_bytes() >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+        // Increase compaction threads if bytes needed for compaction exceeds
+        // 1/4 of threshold for slowing down.
+        // If soft pending compaction byte limit is not set, always speed up
+        // compaction.
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+          ROCKS_LOG_INFO(
+              ioptions_.logger,
+              "[%s] Increasing compaction threads because of estimated "
+              "pending compaction bytes %" PRIu64,
+              name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+        }
+      } else {
+        write_controller_token_.reset();
+      }
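// Worked example of the near_stop test above (illustrative): with
// soft_pending_compaction_bytes_limit = 64 GiB and
// hard_pending_compaction_bytes_limit = 128 GiB, the last quarter of the gap
// starts at 64 GiB + 3 * (128 GiB - 64 GiB) / 4 = 112 GiB of estimated
// compaction debt; beyond that the delay is penalized as if stopped.
uint64_t soft = 64ull << 30, hard = 128ull << 30;
uint64_t near_stop_at = soft + 3 * (hard - soft) / 4;  // 112 GiB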
+      // If the DB recovers from delay conditions, we reward with reducing
+      // double the slowdown ratio. This is to balance the long term slowdown
+      // increase signal.
+      if (needed_delay) {
+        uint64_t write_rate = write_controller->delayed_write_rate();
+        write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+            static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+        // Set the low pri limit to be 1/4 the delayed write rate.
+        // Note we don't reset this value even after the delay condition is
+        // released. Low-pri rate will continue to apply if there is
+        // compaction pressure.
+        write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+                                                                    4);
+      }
+    }
+    prev_compaction_needed_bytes_ = compaction_needed_bytes;
+  }
+  return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+  current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+  return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+  return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
+  return VersionSet::GetTotalBlobFileSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+  return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+                      write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+  mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+  return !mutable_cf_options_.disable_auto_compactions &&
+         compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+    const MutableCFOptions& mutable_options,
+    const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
+  SequenceNumber earliest_mem_seqno =
+      std::min(mem_->GetEarliestSequenceNumber(),
+               imm_.current()->GetEarliestSequenceNumber(false));
+  auto* result = compaction_picker_->PickCompaction(
+      GetName(), mutable_options, mutable_db_options, current_->storage_info(),
+      log_buffer, earliest_mem_seqno);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  return compaction_picker_->RangeOverlapWithCompaction(
+      smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+    const autovector<Range>& ranges, SuperVersion* super_version,
+    bool allow_data_in_errors, bool* overlap) {
+  assert(overlap != nullptr);
+  *overlap = false;
+  // Create an InternalIterator over all unflushed memtables
+  Arena arena;
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_opts, &arena));
+  super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+                                   false /* add_range_tombstone_iter */);
+  ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+  auto read_seq = super_version->current->version_set()->LastSequence();
+  ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+  auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+      read_opts, read_seq, false /* immutable_memtable */);
+  range_del_agg.AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+  Status status;
+  status = super_version->imm->AddRangeTombstoneIterators(
+      read_opts, nullptr /* arena */, &range_del_agg);
+  // AddRangeTombstoneIterators always returns Status::OK.
+  assert(status.ok());
+
+  for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+    auto* vstorage = super_version->current->storage_info();
+    auto* ucmp = vstorage->InternalComparator()->user_comparator();
+    InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    memtable_iter->Seek(range_start.Encode());
+    status = memtable_iter->status();
+    ParsedInternalKey seek_result;
+
+    if (status.ok() && memtable_iter->Valid()) {
+      status = ParseInternalKey(memtable_iter->key(), &seek_result,
+                                allow_data_in_errors);
+    }
+
+    if (status.ok()) {
+      if (memtable_iter->Valid() &&
+          ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+        *overlap = true;
+      } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+                                                 ranges[i].limit)) {
+        *overlap = true;
+      }
+    }
+  }
+  return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+    const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, int input_level,
+    int output_level, const CompactRangeOptions& compact_range_options,
+    const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end, bool* conflict,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  auto* result = compaction_picker_->CompactRange(
+      GetName(), mutable_cf_options, mutable_db_options,
+      current_->storage_info(), input_level, output_level,
+      compact_range_options, begin, end, compaction_end, conflict,
+      max_file_num_to_ignore, trim_ts);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+  SuperVersion* sv = GetThreadLocalSuperVersion(db);
+  sv->Ref();
+  if (!ReturnThreadLocalSuperVersion(sv)) {
+    // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+    // when the thread-local pointer was populated. So, the Ref() earlier in
+    // this function still prevents the returned SuperVersion* from being
+    // deleted out from under the caller.
+    sv->Unref();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value when returning the
+  // SuperVersion back to thread local, with an atomic compare and swap.
+  // The superversion will need to be released if detected to be stale.
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
+  assert(ptr != SuperVersion::kSVInUse);
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
+      db->mutex()->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+        db->AddSuperVersionsToFreeQueue(sv);
+        db->SchedulePurge();
+      } else {
+        sv_to_delete = sv;
+      }
+    } else {
+      db->mutex()->Lock();
+    }
+    sv = super_version_->Ref();
+    db->mutex()->Unlock();
+
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happened. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
+                                           InstrumentedMutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(sv_context, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+    SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  SuperVersion* new_superversion = sv_context->new_superversion.release();
+  new_superversion->mutable_cf_options = mutable_cf_options;
+  new_superversion->Init(this, mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  if (old_superversion == nullptr || old_superversion->current != current() ||
+      old_superversion->mem != mem_ ||
+      old_superversion->imm != imm_.current()) {
+    // Should not recalculate the slowdown condition if nothing has changed,
+    // since RecalculateWriteStallConditions() currently treats any
+    // recalculation as a sign that further slowing down is needed.
+    super_version_->write_stall_condition =
+        RecalculateWriteStallConditions(mutable_cf_options);
+  } else {
+    super_version_->write_stall_condition =
+        old_superversion->write_stall_condition;
+  }
+  if (old_superversion != nullptr) {
+    // Reset SuperVersions cached in thread local storage.
+    // This should be done before old_superversion->Unref(). That's to ensure
+    // that local_sv_ never holds the last reference to SuperVersion, since
+    // it has no means to safely do SuperVersion cleanup.
+    ResetThreadLocalSuperVersions();
+
+    if (old_superversion->mutable_cf_options.write_buffer_size !=
+        mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+    }
+    if (old_superversion->write_stall_condition !=
+        new_superversion->write_stall_condition) {
+      sv_context->PushWriteStallNotification(
+          old_superversion->write_stall_condition,
+          new_superversion->write_stall_condition, GetName(), ioptions());
+    }
+    if (old_superversion->Unref()) {
+      old_superversion->Cleanup();
+      sv_context->superversions_to_free.push_back(old_superversion);
+    }
+  }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    bool was_last_ref __attribute__((__unused__));
+    was_last_ref = sv->Unref();
+    // sv couldn't have been the last reference because
+    // ResetThreadLocalSuperVersions() is called before
+    // unref'ing super_version_.
+    assert(!was_last_ref);
+  }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+    const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+  Status s;
+  s = CheckCompressionSupported(cf_options);
+  if (s.ok() && db_options.allow_concurrent_memtable_write) {
+    s = CheckConcurrentWritesSupported(cf_options);
+  }
+  if (s.ok() && db_options.unordered_write &&
+      cf_options.max_successive_merges != 0) {
+    s = Status::InvalidArgument(
+        "max_successive_merges > 0 is incompatible with unordered_write");
+  }
+  if (s.ok()) {
+    s = CheckCFPathsSupported(db_options, cf_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "TTL is only supported in Block-Based Table format. ");
+    }
+  }
+
+  if (cf_options.periodic_compaction_seconds > 0 &&
+      cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "Periodic Compaction is only supported in "
+          "Block-Based Table format. ");
+    }
+  }
+
+  if (cf_options.enable_blob_garbage_collection) {
+    if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
+        cf_options.blob_garbage_collection_age_cutoff > 1.0) {
+      return Status::InvalidArgument(
+          "The age cutoff for blob garbage collection should be in the range "
+          "[0.0, 1.0].");
+    }
+    if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
+        cf_options.blob_garbage_collection_force_threshold > 1.0) {
+      return Status::InvalidArgument(
+          "The garbage ratio threshold for forcing blob garbage collection "
+          "should be in the range [0.0, 1.0].");
+    }
+  }
+
+  if (cf_options.compaction_style == kCompactionStyleFIFO &&
+      db_options.max_open_files != -1 && cf_options.ttl > 0) {
+    return Status::NotSupported(
+        "FIFO compaction only supported with max_open_files = -1.");
+  }
+
+  std::vector<uint32_t> supported{0, 1, 2, 4, 8};
+  if (std::find(supported.begin(), supported.end(),
+                cf_options.memtable_protection_bytes_per_key) ==
+      supported.end()) {
+    return Status::NotSupported(
+        "Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
+        "or 8 bytes per key.");
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+    const DBOptions& db_opts,
+    const std::unordered_map<std::string, std::string>& options_map) {
+  ColumnFamilyOptions cf_opts =
+      BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+  ConfigOptions config_opts;
+  config_opts.mutable_options_only = true;
+  Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
+                                           &cf_opts);
+  if (s.ok()) {
+    s = ValidateOptions(db_opts, cf_opts);
+  }
+  if (s.ok()) {
+    mutable_cf_options_ = MutableCFOptions(cf_opts);
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  }
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+  if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+    return Env::WLTH_NOT_SET;
+  }
+  if (level == 0) {
+    return Env::WLTH_MEDIUM;
+  }
+  int base_level = current_->storage_info()->base_level();
+
+  // L1: medium, L2: long, ...
+  if (level - base_level >= 2) {
+    return Env::WLTH_EXTREME;
+  } else if (level < base_level) {
+    // There is no restriction that prevents the level passed in from being
+    // smaller than base_level.
+    return Env::WLTH_MEDIUM;
+  }
+  return static_cast<Env::WriteLifeTimeHint>(
+      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+    std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
+  Status s;
+  assert(created_dirs != nullptr);
+  assert(data_dirs_.empty());
+  for (auto& p : ioptions_.cf_paths) {
+    auto existing_dir = created_dirs->find(p.path);
+
+    if (existing_dir == created_dirs->end()) {
+      std::unique_ptr<FSDirectory> path_directory;
+      s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
+                                        &path_directory);
+      if (!s.ok()) {
+        return s;
+      }
+      assert(path_directory != nullptr);
+      data_dirs_.emplace_back(path_directory.release());
+      (*created_dirs)[p.path] = data_dirs_.back();
+    } else {
+      data_dirs_.emplace_back(existing_dir->second);
+    }
+  }
+  assert(data_dirs_.size() == ioptions_.cf_paths.size());
+  return s;
+}
+
+FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+  if (data_dirs_.empty()) {
+    return nullptr;
+  }
+
+  assert(path_id < data_dirs_.size());
+  return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const ImmutableDBOptions* db_options,
+                                 const FileOptions& file_options,
+                                 Cache* table_cache,
+                                 WriteBufferManager* _write_buffer_manager,
+                                 WriteController* _write_controller,
+                                 BlockCacheTracer* const block_cache_tracer,
+                                 const std::shared_ptr<IOTracer>& io_tracer,
+                                 const std::string& db_id,
+                                 const std::string& db_session_id)
+    : max_column_family_(0),
+      file_options_(file_options),
+      dummy_cfd_(new ColumnFamilyData(
+          ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
+          nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
+          block_cache_tracer, io_tracer, db_id, db_session_id)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      table_cache_(table_cache),
+      write_buffer_manager_(_write_buffer_manager),
+      write_controller_(_write_controller),
+      block_cache_tracer_(block_cache_tracer),
+      io_tracer_(io_tracer),
+      db_id_(db_id),
+      db_session_id_(db_session_id) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    bool last_ref __attribute__((__unused__));
+    last_ref = cfd->UnrefAndTryDelete();
+    assert(last_ref);
+  }
+  bool dummy_last_ref __attribute__((__unused__));
+  dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+  assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
+    const std::string& name) const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
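+
+// Editor's note: a worked example of the ID invariants above. Creating column
+// families "one", "two", "three" yields IDs 1, 2, 3; dropping "three" and then
+// creating another column family yields ID 4, never a reused 3, because
+// max_column_family_ only grows (GetNextColumnFamilyID() pre-increments it and
+// UpdateMaxColumnFamily() only takes the max) and the value is persisted in
+// the MANIFEST. See the DontReuseColumnFamilyID test in column_family_test.cc.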
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd = new ColumnFamilyData(
+      id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+      *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
+      db_id_, db_session_id_);
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  uint32_t column_family_id = 0;
+  if (column_family != nullptr) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    column_family_id = cfh->GetID();
+  }
+  return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family != nullptr) {
+    return column_family->GetComparator();
+  }
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..3e6d01d22
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,845 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash_containers.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+class BlobFileCache;
+class BlobSource;
+
+extern const double kIncSlowdownRatio;
+
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// follows:
+//
+//       +----------------------+    +----------------------+   +--------+
+//   +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 |   | DBImpl |
+//   |   +----------------------+ |  +----------------------+   +----+---+
+//   | +--------------------------+                                  |
+//   | |                               +-----------------------------+
+//   | |                               |
+//   | | +-----------------------------v-------------------------------+
+//   | | |                                                             |
+//   | | |                        ColumnFamilySet                      |
+//   | | |                                                             |
+//   | | +-------------+--------------------------+----------------+--+
+//   | |               |                          |                |
+//   | +-------------------------------------+    |                |
+//   |                 |                      |    |                v
+//   |   +-------------v-------------+ +-----v----v---------+
+//   |   |                           | |                    |
+//   |   |     ColumnFamilyData 1    | | ColumnFamilyData 2 |    ......
+//   |   |                           | |                    |
+//   +--->                           | |                    |
+//       |                 +---------+ |                    |
+//       |                 | MemTable| |                    |
+//       |                 |  List   | |                    |
+//       +--------+---+--+-+----+----+ +--------------------++
+//                |   |  |      |
+//                |   |  |      |
+//                |   |  |      +-----------------------+
+//                |   |  +-----------+                  |
+//                v   +--------+     |                  |
+//       +--------+--------+   |     |                  |
+//       |                 |   |     |       +----------v----------+
+// +--->+   SuperVersion 1.a +----------------->                    |
+//      |                 +------+   |       | MemTableListVersion |
+//      +---+-------------+ |    |   |       |                     |
+//          |               |    |   |       +----+------------+---+
+//          | current       |    |   |            |            |
+//          |   +-------------+  |   |mem         |            |
+//          |   |             |  |   |            |            |
+//        +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+//        |             | |          | |          | |          |
+//        | Version 1.a | | memtable | | memtable | | memtable |
+//        |             | |   1.a    | |   1.b    | |   1.c    |
+//        +-------------+ |          | |          | |          |
+//                        +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find memtables and Version
+// as well as SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree.
+// This guarantees the memtables and SST
+// files being operated on will not go away until the SuperVersion's
+// reference count drops to zero and it is destroyed.
+//
+// The following graph illustrates a possible referencing relationship:
+//
+// Column       +--------------+      current       +-----------+
+// Family +---->+              +------------------->+           |
+//  Data        | SuperVersion +----------+         | Version A |
+//              |      3       |   imm    |         |           |
+// Iter2 +----->+              |  +-------v------+  +-----------+
+//              +-----+--------+  | MemtableList +----------------> Empty
+//                    |           |  Version r   |  +-----------+
+//                    |           +--------------+  |           |
+//                    +------------------+   current| Version B |
+//              +--------------+         |   +----->+           |
+//              |              |         |   |      +-----+-----+
+// Compaction +>+ SuperVersion +-------------+            ^
+//    Job       |      2       +------+  |                |current
+//              |              +----+ |  | mem            |    +------------+
+//              +--------------+    | |  +---------------------> |
+//                                  | +------------------------> MemTable a  |
+//                                  | mem                  |  |              |
+//              +--------------+    |                      |  +------------+
+//              |              +--------------------------+
+// Iter1 +-----> SuperVersion  |                           |  +------------+
+//              |      1       +------------------------------>+            |
+//              |              +-+                     mem  |  | MemTable b |
+//              +--------------+ |                          |  |            |
+//                               |            +--------------+ +-----^------+
+//                               |    imm     | MemtableList |       |
+//                               +--->+  Version s           +------------+
+//                                    +--------------+
+//                                    +--------------+
+//                                    | MemtableList |
+//                             +------>+  Version t  +--------> Empty
+//                               imm  +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion, two older
+// SuperVersions, SuperVersion2 and SuperVersion1, still exist, and are
+// referenced by a compaction job and an old iterator Iter1, respectively.
+// SuperVersion2 contains Version B, memtable a and memtable b; SuperVersion1
+// contains Version B and memtable b (mutable). As a result, Version B and
+// memtable b are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
+                         InstrumentedMutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const override;
+  virtual const std::string& GetName() const override;
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+  virtual const Comparator* GetComparator() const override;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to a
+// ColumnFamilyHandle (same as the client would need).
+// In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods.
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        internal_cfd_(nullptr) {}
+
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  // Accessing members of this class is not thread-safe and requires external
+  // synchronization (i.e. db mutex held or on write thread).
+  ColumnFamilyData* cfd;
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  MutableCFOptions mutable_cf_options;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  WriteStallCondition write_stall_condition;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+  // If Unref() returns true, Cleanup() should be called with mutex held
+  // before deleting this SuperVersion.
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unrefing those
+  // objects needs to be done in the mutex
+  void Cleanup();
+  void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+            MemTableListVersion* new_imm, Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by a thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with any SuperVersion object address, and to be portable across
+  // platforms.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+
+ private:
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+    const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+                                    const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                           const ColumnFamilyOptions& src);
+// Wrap user-defined table properties collector factories from cf_options
+// into internal ones in int_tbl_prop_collector_factories. Add a system
+// internal one too.
+extern void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  // Ref() can only be called from a context where the caller can guarantee
+  // that ColumnFamilyData is alive (while holding a non-zero ref already,
+  // holding a DB mutex, or as the leader in a write batch group).
+  void Ref() { refs_.fetch_add(1); }
+
+  // UnrefAndTryDelete() decreases the reference count and frees the object
+  // if needed; it returns true if the object was freed, else false.
+  // UnrefAndTryDelete() can only be called while holding a DB mutex, or
+  // during single-threaded recovery.
+  bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under the following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
+  // After dropping a column family, no other operation on that column family
+  // will be executed. All the files and memory will, however, be kept around
+  // until the client drops the column family handle. That way, the client can
+  // still access data from the dropped column family.
+  // A column family can be dropped and still alive. In that state:
+  // *) Compaction and flush is not executed on the dropped column family.
+  // *) Client can continue reading from column family. Writes will fail unless
+  //    WriteOptions::ignore_missing_column_families is true
+  // When the dropped column family is unreferenced, then we:
+  // *) Remove column family from the linked list maintained by ColumnFamilySet
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped();
+  bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+  // thread-safe
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetFlushReason(FlushReason flush_reason) {
+    flush_reason_ = flush_reason;
+  }
+  FlushReason GetFlushReason() const { return flush_reason_; }
+  // thread-safe
+  const FileOptions* soptions() const;
+  const ImmutableOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by the current SuperVersion.
+  // You should use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may not be in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+
+  // REQUIRES: DB mutex held
+  // Build ColumnFamilyOptions with immutable options and latest mutable
+  // options.
+  ColumnFamilyOptions GetLatestCFOptions() const;
+
+  bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+  // Validate CF options against DB options
+  static Status ValidateOptions(const DBOptions& db_options,
+                                const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const DBOptions& db_options,
+      const std::unordered_map<std::string, std::string>& options_map);
+#endif  // ROCKSDB_LITE
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+
+  bool IsEmpty() {
+    return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
+  }
+
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRE: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
+  uint64_t GetTotalBlobFileSize() const;  // REQUIRE: DB mutex held
+  void SetMemtable(MemTable* new_mem) {
+    uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+    new_mem->SetID(memtable_id);
+    mem_ = new_mem;
+  }
+
+  // calculate the oldest log needed for the durability of this column family
+  uint64_t OldestLogToKeep();
+
+  // See Memtable constructor for explanation of earliest_seq param.
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+                                 SequenceNumber earliest_seq);
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+                         SequenceNumber earliest_seq);
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+  BlobSource* blob_source() const { return blob_source_.get(); }
+
+  // See documentation in compaction_picker.h
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             const MutableDBOptions& mutable_db_options,
+                             LogBuffer* log_buffer);
+
+  // Check if the passed range overlaps with any running compactions.
+  // REQUIRES: DB mutex held
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Check if the passed ranges overlap with any unflushed memtables
+  // (immutable or mutable).
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+                                    SuperVersion* super_version,
+                                    bool allow_data_in_errors, bool* overlap);
+
+  // A flag indicating that a manual compaction is to compact all levels
+  // together instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag indicating that a manual compaction's output is the base level.
+  static const int kCompactToBaseLevel;
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           int input_level, int output_level,
+                           const CompactRangeOptions& compact_range_options,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end, bool* manual_conflict,
+                           uint64_t max_file_num_to_ignore,
+                           const std::string& trim_ts);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  const IntTblPropCollectorFactories* int_tbl_prop_collector_factories()
+      const {
+    return &int_tbl_prop_collector_factories_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return an already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+  // thread-safe
+  // Get the SuperVersion stored in thread local storage. If it does not
+  // exist, get a reference from the current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than the SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // Installs the new SuperVersion. If the previous SuperVersion's reference
+  // count drops to zero and it needs deletion, it is handed back through
+  // sv_context; otherwise nullptr. Takes an already allocated SuperVersion
+  // (in sv_context) as an argument to enable clients to allocate the
+  // SuperVersion outside of the mutex.
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           const MutableCFOptions& mutable_cf_options);
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           InstrumentedMutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // Protected by DB mutex
+  void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+  void set_queued_for_compaction(bool value) {
+    queued_for_compaction_ = value;
+  }
+  bool queued_for_flush() { return queued_for_flush_; }
+  bool queued_for_compaction() { return queued_for_compaction_; }
+
+  enum class WriteStallCause {
+    kNone,
+    kMemtableLimit,
+    kL0FileCountLimit,
+    kPendingCompactionBytes,
+  };
+  static std::pair<WriteStallCondition, WriteStallCause>
+  GetWriteStallConditionAndCause(
+      int num_unflushed_memtables, int num_l0_files,
+      uint64_t num_compaction_needed_bytes,
+      const MutableCFOptions& mutable_cf_options,
+      const ImmutableCFOptions& immutable_cf_options);
+
+  // Recalculate some stall conditions, which are changed only during
+  // compaction, adding new memtable and/or recalculation of compaction score.
+  WriteStallCondition RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);
+
+  void set_initialized() { initialized_.store(true); }
+
+  bool initialized() const { return initialized_.load(); }
+
+  const ColumnFamilyOptions& initial_cf_options() {
+    return initial_cf_options_;
+  }
+
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers directories already created, so that we don't
+  // need to call the same directory creation operation again.
+  Status AddDirectories(
+      std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
+
+  FSDirectory* GetDataDir(size_t path_id) const;
+
+  // full_history_ts_low_ can only increase.
+  void SetFullHistoryTsLow(std::string ts_low) {
+    assert(!ts_low.empty());
+    const Comparator* ucmp = user_comparator();
+    assert(ucmp);
+    if (full_history_ts_low_.empty() ||
+        ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
+      full_history_ts_low_ = std::move(ts_low);
+    }
+  }
+
+  const std::string& GetFullHistoryTsLow() const {
+    return full_history_ts_low_;
+  }
+
+  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
+  std::shared_ptr<CacheReservationManager>
+  GetFileMetadataCacheReservationManager() {
+    return file_metadata_cache_res_mgr_;
+  }
+
+  SequenceNumber GetFirstMemtableSequenceNumber() const;
+
+  static const uint32_t kDummyColumnFamilyDataId;
+
+  // Keep track of whether the mempurge feature was ever used.
+  void SetMempurgeUsed() { mempurge_used_ = true; }
+  bool GetMempurgeUsed() { return mempurge_used_; }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(uint32_t id, const std::string& name,
+                   Version* dummy_versions, Cache* table_cache,
+                   WriteBufferManager* write_buffer_manager,
+                   const ColumnFamilyOptions& options,
+                   const ImmutableDBOptions& db_options,
+                   const FileOptions* file_options,
+                   ColumnFamilySet* column_family_set,
+                   BlockCacheTracer* const block_cache_tracer,
+                   const std::shared_ptr<IOTracer>& io_tracer,
+                   const std::string& db_id, const std::string& db_session_id);
+
+  std::vector<std::string> GetDbPaths() const;
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  std::atomic<int> refs_;  // outstanding references to ColumnFamilyData
+  std::atomic<bool> initialized_;
+  std::atomic<bool> dropped_;  // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
+
+  const ColumnFamilyOptions initial_cf_options_;
+  const ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+
+  const bool is_delete_range_supported_;
+
+  std::unique_ptr<TableCache> table_cache_;
+  std::unique_ptr<BlobFileCache> blob_file_cache_;
+  std::unique_ptr<BlobSource> blob_source_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations over
+  // all column families that are alive (note: dropped column families can also
+  // be alive as long as client holds a reference)
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from.
+  uint64_t log_number_;
+
+  std::atomic<FlushReason> flush_reason_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool queued_for_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool queued_for_compaction_;
+
+  uint64_t prev_compaction_needed_bytes_;
+
+  // if the database was opened with 2pc enabled
+  bool allow_2pc_;
+
+  // Memtable id to track flush.
+  std::atomic<uint64_t> last_memtable_id_;
+
+  // Directories corresponding to cf_paths.
+  std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
+
+  bool db_paths_registered_;
+
+  std::string full_history_ts_low_;
+
+  // For charging memory usage of file metadata created for newly added files
+  // to a Version associated with this CFD
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+  bool mempurge_used_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+//   mutex AND executed in the write thread.
+//   CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply()
+//   AND single-threaded write thread. It is also called during Recovery and in
+//   DumpManifest().
+//   RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to
+//   be held and it needs to be executed from the write thread. SetDropped()
+//   also guarantees that it will be called only from single-threaded
+//   LogAndApply(), but this condition is not that important.
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+//   body of the iteration, wrap in a RefedColumnFamilySet.
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+//   NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
+    // NOTE: minimum operators for for-loop iteration
+    iterator& operator++() {
+      current_ = current_->next_;
+      return *this;
+    }
+    bool operator!=(const iterator& other) const {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname,
+                  const ImmutableDBOptions* db_options,
+                  const FileOptions& file_options, Cache* table_cache,
+                  WriteBufferManager* _write_buffer_manager,
+                  WriteController* _write_controller,
+                  BlockCacheTracer* const block_cache_tracer,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  const std::string& db_id, const std::string& db_session_id);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // this call will return the next available column family ID. it guarantees
+  // that there is no column family with id greater than or equal to the
+  // returned value in the current running instance or anytime in RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  Cache* get_table_cache() { return table_cache_; }
+
+  WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
+
+  WriteController* write_controller() { return write_controller_; }
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating, both conditions have to be satisfied:
+  //   1. DB mutex locked
+  //   2. thread currently in single-threaded write thread
+  // * when reading, at least one condition needs to be satisfied:
+  //   1. DB mutex locked
+  //   2. accessed from a single-threaded write thread
+  UnorderedMap<std::string, uint32_t> column_families_;
+  UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  const FileOptions file_options_;
+
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_. This is
+  // just a cache that makes common case (accessing default column family)
+  // faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const ImmutableDBOptions* const db_options_;
+  Cache* table_cache_;
+  WriteBufferManager* write_buffer_manager_;
+  WriteController* write_controller_;
+  BlockCacheTracer* const block_cache_tracer_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  const std::string& db_id_;
+  std::string db_session_id_;
+};
+
+// A wrapper for ColumnFamilySet that supports releasing the DB mutex during
+// each iteration over the iterator, because the cfd is Refed and Unrefed
+// during each iteration to prevent a concurrent CF drop from destroying it
+// (until Unref).
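+//
+// A minimal usage sketch (editor's note; hypothetical caller, with the DB
+// mutex held on entry and possibly released inside the loop body, which is
+// what the iterator's Ref/Unref dance makes safe):
+//
+//   for (ColumnFamilyData* cfd : RefedColumnFamilySet(cf_set)) {
+//     // The DB mutex may be released and re-acquired here; cfd stays alive.
+//   }
+//
+// where cf_set is a ColumnFamilySet*, e.g. the one owned by the VersionSet.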
+class RefedColumnFamilySet { + public: + explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} + + class iterator { + public: + explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { + MaybeRef(*wrapped_); + } + ~iterator() { MaybeUnref(*wrapped_); } + inline void MaybeRef(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->Ref(); + } + } + inline void MaybeUnref(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->UnrefAndTryDelete(); + } + } + // NOTE: minimum operators for for-loop iteration + inline iterator& operator++() { + ColumnFamilyData* old = *wrapped_; + ++wrapped_; + // Can only unref & potentially free cfd after accessing its next_ + MaybeUnref(old); + MaybeRef(*wrapped_); + return *this; + } + inline bool operator!=(const iterator& other) const { + return this->wrapped_ != other.wrapped_; + } + inline ColumnFamilyData* operator*() { return *wrapped_; } + + private: + ColumnFamilySet::iterator wrapped_; + }; + + iterator begin() { return iterator(wrapped_->begin()); } + iterator end() { return iterator(wrapped_->end()); } + + private: + ColumnFamilySet* wrapped_; +}; + +// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access +// memtables of different column families (specified by ID in the write batch) +class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { + public: + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) + : column_family_set_(column_family_set), current_(nullptr) {} + + // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed + // with the arguments used to construct *orig. + explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) + : column_family_set_(orig->column_family_set_), current_(nullptr) {} + + // sets current_ to ColumnFamilyData with column_family_id + // returns false if column family doesn't exist + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + bool Seek(uint32_t column_family_id) override; + + // Returns log number of the selected column family + // REQUIRES: under a DB mutex OR from a write thread + uint64_t GetLogNumber() const override; + + // REQUIRES: Seek() called first + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual MemTable* GetMemTable() const override; + + // Returns column family handle for the selected column family + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + + // Cannot be called while another thread is calling Seek(). 
+ // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual ColumnFamilyData* current() override { return current_; } + + private: + ColumnFamilySet* column_family_set_; + ColumnFamilyData* current_; + ColumnFamilyHandleInternal handle_; +}; + +extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); + +extern const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc new file mode 100644 index 000000000..d33cbe50a --- /dev/null +++ b/src/rocksdb/db/column_family_test.cc @@ -0,0 +1,3453 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +static const int kValueSize = 1000; + +// counts how many operations were performed +class EnvCounter : public SpecialEnv { + public: + explicit EnvCounter(Env* base) + : SpecialEnv(base), num_new_writable_file_(0) {} + int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) override { + ++num_new_writable_file_; + return EnvWrapper::NewWritableFile(f, r, soptions); + } + + private: + std::atomic num_new_writable_file_; +}; + +class ColumnFamilyTestBase : public testing::Test { + public: + explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(nullptr, base_env); + env_ = new EnvCounter(base_env); + env_->skip_fsync_ = true; + dbname_ = test::PerThreadDBPath("column_family_test"); + db_options_.create_if_missing = true; + db_options_.fail_if_options_file_error = true; + db_options_.env = env_; + EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); + } + + ~ColumnFamilyTestBase() override { + std::vector column_families; + for (auto h : handles_) { + ColumnFamilyDescriptor cfdescriptor; + Status s = h->GetDescriptor(&cfdescriptor); +#ifdef ROCKSDB_LITE + EXPECT_TRUE(s.IsNotSupported()); +#else + EXPECT_OK(s); +#endif // ROCKSDB_LITE + column_families.push_back(cfdescriptor); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Destroy(column_families); + delete env_; + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions 
options; + options.format_version = format_; + return options; + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + if (k == 0) { + // Ugh. Random seed of 0 used to produce no entropy. This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. + *storage = std::string(kValueSize, ' '); + } else { + Random r(k); + *storage = r.RandomString(kValueSize); + } + return Slice(*storage); + } + + void Build(int base, int n, int flush_every = 0) { + std::string key_space, value_space; + WriteBatch batch; + + for (int i = 0; i < n; i++) { + if (flush_every != 0 && i != 0 && i % flush_every == 0) { + DBImpl* dbi = static_cast_with_check(db_); + dbi->TEST_FlushMemTable(); + } + + int keyi = base + i; + Slice key(DBTestBase::Key(keyi)); + + batch.Clear(); + batch.Put(handles_[0], key, Value(keyi, &value_space)); + batch.Put(handles_[1], key, Value(keyi, &value_space)); + batch.Put(handles_[2], key, Value(keyi, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void CheckMissed() { + uint64_t next_expected = 0; + uint64_t missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + for (int cf = 0; cf < 3; cf++) { + next_expected = 0; + Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + in.remove_prefix(3); + if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(static_cast(key), &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + } + + ASSERT_EQ(0, bad_keys); + ASSERT_EQ(0, bad_values); + ASSERT_EQ(0, missed); + (void)correct; + } + + void Close() { + for (auto h : handles_) { + if (h) { + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); + } + } + handles_.clear(); + names_.clear(); + delete db_; + db_ = nullptr; + } + + Status TryOpen(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); + names_.push_back(cf[i]); + } + return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); + } + + Status OpenReadOnly(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.emplace_back( + cf[i], options.size() == 0 ? 
column_family_options_ : options[i]); + names_.push_back(cf[i]); + } + return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, + &db_); + } + +#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported + void AssertOpenReadOnly(std::vector cf, + std::vector options = {}) { + ASSERT_OK(OpenReadOnly(cf, options)); + } +#endif // !ROCKSDB_LITE + + void Open(std::vector cf, + std::vector options = {}) { + ASSERT_OK(TryOpen(cf, options)); + } + + void Open() { Open({"default"}); } + + DBImpl* dbfull() { return static_cast_with_check(db_); } + + int GetProperty(int cf, std::string property) { + std::string value; + EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value)); +#ifndef CYGWIN + return std::stoi(value); +#else + return std::strtol(value.c_str(), 0 /* off */, 10 /* base */); +#endif + } + + bool IsDbWriteStopped() { +#ifndef ROCKSDB_LITE + uint64_t v; + EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v)); + return (v == 1); +#else + return dbfull()->TEST_write_controler().IsStopped(); +#endif // !ROCKSDB_LITE + } + + uint64_t GetDbDelayedWriteRate() { +#ifndef ROCKSDB_LITE + uint64_t v; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)); + return v; +#else + if (!dbfull()->TEST_write_controler().NeedsDelay()) { + return 0; + } + return dbfull()->TEST_write_controler().delayed_write_rate(); +#endif // !ROCKSDB_LITE + } + + void Destroy(const std::vector& column_families = + std::vector()) { + Close(); + ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_), + column_families)); + } + + void CreateColumnFamilies( + const std::vector& cfs, + const std::vector options = {}) { + int cfi = static_cast(handles_.size()); + handles_.resize(cfi + cfs.size()); + names_.resize(cfi + cfs.size()); + for (size_t i = 0; i < cfs.size(); ++i) { + const auto& current_cf_opt = + options.size() == 0 ? column_family_options_ : options[i]; + ASSERT_OK( + db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi])); + names_[cfi] = cfs[i]; + +#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor + // Verify the CF options of the returned CF handle. + ColumnFamilyDescriptor desc; + ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); + // Need to sanitize the default column family options before comparing + // them. 
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + ConfigOptions(), desc.options, + SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt))); +#endif // !ROCKSDB_LITE + cfi++; + } + } + + void Reopen(const std::vector options = {}) { + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + Close(); + assert(options.size() == 0 || names.size() == options.size()); + Open(names, options); + } + + void CreateColumnFamiliesAndReopen(const std::vector& cfs) { + CreateColumnFamilies(cfs); + Reopen(); + } + + void DropColumnFamilies(const std::vector& cfs) { + for (auto cf : cfs) { + ASSERT_OK(db_->DropColumnFamily(handles_[cf])); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf])); + handles_[cf] = nullptr; + names_[cf] = ""; + } + } + + void PutRandomData(int cf, int num, int key_value_size, bool save = false) { + if (cf >= static_cast(keys_.size())) { + keys_.resize(cf + 1); + } + for (int i = 0; i < num; ++i) { + // 10 bytes for key, rest is value + if (!save) { + ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11), + rnd_.RandomString(key_value_size - 10))); + } else { + std::string key = test::RandomKey(&rnd_, 11); + keys_[cf].insert(key); + ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10))); + } + } + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); + } + +#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite + void WaitForFlush(int cf) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + + void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); } + + uint64_t MaxTotalInMemoryState() { + return dbfull()->TEST_MaxTotalInMemoryState(); + } + + void AssertMaxTotalInMemoryState(uint64_t value) { + ASSERT_EQ(value, MaxTotalInMemoryState()); + } +#endif // !ROCKSDB_LITE + + Status Put(int cf, const std::string& key, const std::string& value) { + return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Merge(int cf, const std::string& key, const std::string& value) { + return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Flush(int cf) { return db_->Flush(FlushOptions(), handles_[cf]); } + + std::string Get(int cf, const std::string& key) { + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db_->Get(options, handles_[cf], Slice(key), &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + void CompactAll(int cf) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr, + nullptr)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + int NumTableFilesAtLevel(int level, int cf) { + return GetProperty(cf, + "rocksdb.num-files-at-level" + std::to_string(level)); + } + +#ifndef ROCKSDB_LITE + // Return spread of files per level + std::string FilesPerLevel(int cf) { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = static_cast(result.size()); + } + } + result.resize(last_non_zero_offset); + return result; + } +#endif + + void AssertFilesPerLevel(const std::string& value, int cf) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(value, FilesPerLevel(cf)); +#else + (void)value; + (void)cf; +#endif + } + +#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported + int CountLiveFiles() { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return static_cast(metadata.size()); + } +#endif // !ROCKSDB_LITE + + void AssertCountLiveFiles(int expected_value) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(expected_value, CountLiveFiles()); +#else + (void)expected_value; +#endif + } + + // Do n memtable flushes, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int cf, int n, const std::string& small, + const std::string& large) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf])); + } + } + +#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported + int CountLiveLogFiles() { + int micros_wait_for_log_deletion = 20000; + env_->SleepForMicroseconds(micros_wait_for_log_deletion); + int ret = 0; + VectorLogPtr wal_files; + Status s; + // GetSortedWalFiles is a flakey function -- it gets all the wal_dir + // children files and then later checks for their existence. if some of the + // log files doesn't exist anymore, it reports an error. it does all of this + // without DB mutex held, so if a background process deletes the log file + // while the function is being executed, it returns an error. We retry the + // function 10 times to avoid the error failing the test + for (int retries = 0; retries < 10; ++retries) { + wal_files.clear(); + s = db_->GetSortedWalFiles(wal_files); + if (s.ok()) { + break; + } + } + EXPECT_OK(s); + for (const auto& wal : wal_files) { + if (wal->Type() == kAliveLogFile) { + ++ret; + } + } + return ret; + return 0; + } +#endif // !ROCKSDB_LITE + + void AssertCountLiveLogFiles(int value) { +#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported + ASSERT_EQ(value, CountLiveLogFiles()); +#else + (void)value; +#endif // !ROCKSDB_LITE + } + + void AssertNumberOfImmutableMemtables(std::vector num_per_cf) { + assert(num_per_cf.size() == handles_.size()); + +#ifndef ROCKSDB_LITE // GetProperty is not supported in lite + for (size_t i = 0; i < num_per_cf.size(); ++i) { + ASSERT_EQ(num_per_cf[i], GetProperty(static_cast(i), + "rocksdb.num-immutable-mem-table")); + } +#endif // !ROCKSDB_LITE + } + + void CopyFile(const std::string& source, const std::string& destination, + uint64_t size = 0) { + const EnvOptions soptions; + std::unique_ptr srcfile; + ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); + std::unique_ptr destfile; + ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); + + if (size == 0) { + // default argument means copy everything + ASSERT_OK(env_->GetFileSize(source, &size)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t one = std::min(uint64_t(sizeof(buffer)), size); + ASSERT_OK(srcfile->Read(one, &slice, buffer)); + ASSERT_OK(destfile->Append(slice)); + size -= slice.size(); + } + ASSERT_OK(destfile->Close()); + } + + int GetSstFileCount(std::string path) { + std::vector files; + DBTestBase::GetSstFiles(env_, path, &files); + return static_cast(files.size()); + } + + void RecalculateWriteStallConditions( + 
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options) {
+    // add lock to avoid race condition between
+    // `RecalculateWriteStallConditions` which writes to CFStats and
+    // background `DBImpl::DumpStats()` threads which read CFStats
+    dbfull()->TEST_LockMutex();
+    cfd->RecalculateWriteStallConditions(mutable_cf_options);
+    dbfull()->TEST_UnlockMutex();
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;
+  std::vector<std::set<std::string>> keys_;
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  EnvCounter* env_;
+  std::shared_ptr<Env> env_guard_;
+  Random rnd_;
+  uint32_t format_;
+};
+
+class ColumnFamilyTest
+    : public ColumnFamilyTestBase,
+      virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+  ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+                        testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+                        testing::Values(kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+    CreateColumnFamilies({"three2"});
+    // ID 3 that was used for dropped column family "three" should not be
+    // reused
+    auto cfh3 = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[3]);
+    ASSERT_EQ(4U, cfh3->GetID());
+    Close();
+    Destroy();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+  Open();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteOptionsFile:1",
+        "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+       {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+        "DBImpl::WriteOptionsFile:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread(
+      [&] { CreateColumnFamilies({"one"}); });
+
+  TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+  uint64_t pv;
+  db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+  TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+    : public ColumnFamilyTestBase,
+      virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  FlushEmptyCFTestWithParam()
+      : ColumnFamilyTestBase(std::get<0>(GetParam())),
+        allow_2pc_(std::get<1>(GetParam())) {}
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  db_options_.allow_2pc = allow_2pc_;
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  // Generate log file A.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 1
+
+  Reopen();
+  // Log file A is not dropped after reopening because default column family's
+  // min log number is 0.
+  // The data from log file A (seqID 1) is flushed to SST file X.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 2
+  ASSERT_OK(Put(1, "bar", "v2"));  // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is
+  // created and is set to current. Both CFs' min log numbers are set to file
+  // C in memory, so after flushing file B is deleted. At the same time, the
+  // min log number of the default CF is not written to the manifest. Log
+  // file A still remains.
+  // Flushed to SST file Y.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Put(1, "bar", "v3"));  // seqID 4
+  ASSERT_OK(Put(1, "foo", "v4"));  // seqID 5
+  ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+
+  // Preserve file system state up to here to simulate a crash condition.
+  fault_env->SetFilesystemActive(false);
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+
+  Close();
+  fault_env->ResetState();
+
+  // Before opening, there are four files:
+  //   Log file A contains seqID 1
+  //   Log file C contains seqID 4, 5
+  //   SST file X contains seqID 1
+  //   SST file Y contains seqID 2, 3
+  // Min log number:
+  //   default CF: 0
+  //   CF one, two: C
+  // When opening the DB, all the seqIDs should be preserved.
+  Open(names, {});
+  ASSERT_EQ("v4", Get(1, "foo"));
+  ASSERT_EQ("v3", Get(1, "bar"));
+  Close();
+
+  db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  db_options_.allow_2pc = allow_2pc_;
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  // Generate log file A.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 1
+
+  Reopen();
+  // Log file A is not dropped after reopening because default column family's
+  // min log number is 0.
+  // The data from log file A (seqID 1) is flushed to SST file X.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 2
+  ASSERT_OK(Put(1, "bar", "v2"));  // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is
+  // created and is set to current. Both CFs' min log number is set to file C
+  // so after flushing file B is deleted. Log file A still remains.
+  // Flushed to SST file Y.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(0, "bar", "v2"));  // seqID 4
+  ASSERT_OK(Put(2, "bar", "v2"));  // seqID 5
+  ASSERT_OK(Put(1, "bar", "v3"));  // seqID 6
+  // Flushing all column families. This forces all CFs' min log to the
+  // current one. This is written to the manifest file. Log file C is cleared.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Flush(2));
+  // Write to log file D
+  ASSERT_OK(Put(1, "bar", "v4"));  // seqID 7
+  ASSERT_OK(Put(1, "bar", "v5"));  // seqID 8
+  ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+  // Preserve file system state up to here to simulate a crash condition.
+  fault_env->SetFilesystemActive(false);
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+
+  Close();
+  fault_env->ResetState();
+  // Before opening, there are two logfiles:
+  //   Log file A contains seqID 1
+  //   Log file D contains seqID 7, 8
+  // Min log number:
+  //   default CF: D
+  //   CF one, two: D
+  // When opening the DB, log file D should be replayed using the seqIDs
+  // specified in the file.
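+  // Log file A is below every CF's min log number at this point, so
+  // recovery skips it; only seqID 7 and 8 from log file D are re-applied.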
+ Open(names, {}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + Close(); + + db_options_.env = env_; +} + +INSTANTIATE_TEST_CASE_P( + FormatDef, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(test::kDefaultFormatVersion, true), + std::make_tuple(test::kDefaultFormatVersion, false))); +INSTANTIATE_TEST_CASE_P( + FormatLatest, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(kLatestFormatVersion, true), + std::make_tuple(kLatestFormatVersion, false))); + +TEST_P(ColumnFamilyTest, AddDrop) { + Open(); + CreateColumnFamilies({"one", "two", "three"}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "fodor")); + DropColumnFamilies({2}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + CreateColumnFamilies({"four"}); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_EQ("mirko", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + Close(); + ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument()); + Open({"default", "one", "three", "four"}); + DropColumnFamilies({1}); + Reopen(); + Close(); + + std::vector families; + ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); + std::sort(families.begin(), families.end()); + ASSERT_TRUE(families == + std::vector({"default", "four", "three"})); +} + +TEST_P(ColumnFamilyTest, BulkAddDrop) { + constexpr int kNumCF = 1000; + ColumnFamilyOptions cf_options; + WriteOptions write_options; + Open(); + std::vector cf_names; + std::vector cf_handles; + for (int i = 1; i <= kNumCF; i++) { + cf_names.push_back("cf1-" + std::to_string(i)); + } + ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles)); + for (int i = 1; i <= kNumCF; i++) { + ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar")); + } + ASSERT_OK(db_->DropColumnFamilies(cf_handles)); + std::vector cf_descriptors; + for (auto* handle : cf_handles) { + delete handle; + } + cf_handles.clear(); + for (int i = 1; i <= kNumCF; i++) { + cf_descriptors.emplace_back("cf2-" + std::to_string(i), + ColumnFamilyOptions()); + } + ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles)); + for (int i = 1; i <= kNumCF; i++) { + ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar")); + } + ASSERT_OK(db_->DropColumnFamilies(cf_handles)); + for (auto* handle : cf_handles) { + delete handle; + } + Close(); + std::vector families; + ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); + std::sort(families.begin(), families.end()); + ASSERT_TRUE(families == std::vector({"default"})); +} + +TEST_P(ColumnFamilyTest, DropTest) { + // first iteration - don't reopen DB before dropping + // second iteration - reopen DB before dropping + for (int iter = 0; iter < 2; ++iter) { + Open({"default"}); + CreateColumnFamiliesAndReopen({"pikachu"}); + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i))); + } + ASSERT_OK(Flush(1)); + + if (iter == 1) { + Reopen(); + } + ASSERT_EQ("bar1", Get(1, "1")); + + AssertCountLiveFiles(1); + DropColumnFamilies({1}); + // make sure that all files are deleted when we drop the column family + AssertCountLiveFiles(0); + Destroy(); + } +} + +TEST_P(ColumnFamilyTest, WriteBatchFailure) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + WriteBatch batch; + ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family"))); + ASSERT_OK( + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"))); + 
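+  // Both destination column families still exist at this point, so the
+  // batch write below goes through.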
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  ASSERT_OK(
+      batch.Put(handles_[0], Slice("still here"), Slice("column-family")));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
+  Status s = db_->Write(WriteOptions(), &batch);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int iter = 0; iter <= 3; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  std::vector<std::string> old_files;
+  ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+  for (auto& file : old_files) {
+    ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+  }
+
+  column_family_options_.merge_operator =
+      MergeOperators::CreateUInt64AddOperator();
+  db_options_.wal_dir = dbname_ + "/logs";
+  Destroy();
+  Open();
+  CreateColumnFamilies({"cf1", "cf2"});
+
+  // fill up the DB
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(1, "mirko", one));
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(2, "fodor", one));
+  ASSERT_OK(Merge(0, "bar", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(1, "mirko", two));
+  ASSERT_OK(Merge(1, "franjo", one));
+
+  // copy the logs to backup
+  std::vector<std::string> logs;
+  ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs));
+  for (auto& log : logs) {
+    CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+  }
+
+  // recover the DB
+  Close();
+
+  // 1. check consistency
+  // 2. copy the logs from backup back to the WAL dir. If the recovery
+  //    happens again on the same log files, this should lead to incorrect
+  //    results due to applying the merge operator twice
+  // 3. check consistency
+  for (int iter = 0; iter < 2; ++iter) {
+    // assert consistency
+    Open({"default", "cf1", "cf2"});
+    ASSERT_EQ(two, Get(0, "foo"));
+    ASSERT_EQ(one, Get(0, "bar"));
+    ASSERT_EQ(three, Get(1, "mirko"));
+    ASSERT_EQ(one, Get(1, "franjo"));
+    ASSERT_EQ(one, Get(2, "fodor"));
+    ASSERT_EQ(two, Get(2, "bla"));
+    Close();
+
+    if (iter == 0) {
+      // copy the logs from backup back to wal dir
+      for (auto& log : logs) {
+        CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE  // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int j = 0; j < 2; j++) {
+    ReadOptions ro;
+    std::vector<Iterator*> iterators;
+    // Hold super version.
+    if (j == 0) {
+      ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+    }
+
+    for (int i = 0; i < 3; ++i) {
+      uint64_t max_total_in_memory_state = MaxTotalInMemoryState();
+      ASSERT_OK(Flush(i));
+      AssertMaxTotalInMemoryState(max_total_in_memory_state);
+    }
+    ASSERT_OK(Put(1, "foofoo", "bar"));
+    ASSERT_OK(Put(0, "foofoo", "bar"));
+
+    for (auto* it : iterators) {
+      ASSERT_OK(it->status());
+      delete it;
+    }
+  }
+  Reopen();
+
+  for (int iter = 0; iter <= 2; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.arena_block_size = 4 * 1024;
+  column_family_options_.write_buffer_size = 128000;  // 128KB
+  Open();
+  CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. 
if number is in (), it means + // we don't need it anymore (it's been flushed) + // [] + AssertCountLiveLogFiles(0); + PutRandomData(0, 1, 128); + // [0] + PutRandomData(1, 1, 128); + // [0, 1] + PutRandomData(1, 1000, 128); + WaitForFlush(1); + // [0, (1)] [1] + AssertCountLiveLogFiles(2); + PutRandomData(0, 1, 128); + // [0, (1)] [0, 1] + AssertCountLiveLogFiles(2); + PutRandomData(2, 1, 128); + // [0, (1)] [0, 1, 2] + PutRandomData(2, 1000, 128); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [2] + AssertCountLiveLogFiles(3); + PutRandomData(2, 1000, 128); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [(2)] [2] + AssertCountLiveLogFiles(4); + PutRandomData(3, 1, 128); + // [0, (1)] [0, 1, (2)] [(2)] [2, 3] + PutRandomData(1, 1, 128); + // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3] + AssertCountLiveLogFiles(4); + PutRandomData(1, 1000, 128); + WaitForFlush(1); + // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1] + AssertCountLiveLogFiles(5); + PutRandomData(0, 1000, 128); + WaitForFlush(0); + // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0] + // delete obsolete logs --> + // [(1), 2, 3] [1, (0)] [0] + AssertCountLiveLogFiles(3); + PutRandomData(0, 1000, 128); + WaitForFlush(0); + // [(1), 2, 3] [1, (0)], [(0)] [0] + AssertCountLiveLogFiles(4); + PutRandomData(1, 1000, 128); + WaitForFlush(1); + // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1] + AssertCountLiveLogFiles(5); + PutRandomData(2, 1000, 128); + WaitForFlush(2); + // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2] + AssertCountLiveLogFiles(6); + PutRandomData(3, 1000, 128); + WaitForFlush(3); + // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3] + // delete obsolete logs --> + // [0, (1)] [1, (2)], [2, (3)] [3] + AssertCountLiveLogFiles(4); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(ColumnFamilyTest, CrashAfterFlush) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + Open(); + CreateColumnFamilies({"one"}); + + WriteBatch batch; + ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar"))); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(0)); + fault_env->SetFilesystemActive(false); + + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + Close(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); + fault_env->ResetState(); + Open(names, {}); + + // Write batch should be atomic. + ASSERT_EQ(Get(0, "foo"), Get(1, "foo")); + + Close(); + db_options_.env = env_; +} + +TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) { + ASSERT_OK(TryOpen({"default"})); + Close(); + ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument()); +} + +#ifndef ROCKSDB_LITE // WaitForFlush() is not supported +// Makes sure that obsolete log files get deleted +TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { + // disable flushing stale column families + db_options_.max_total_wal_size = std::numeric_limits::max(); + Open(); + CreateColumnFamilies({"one", "two", "three"}); + ColumnFamilyOptions default_cf, one, two, three; + // setup options. 
all column families have max_write_buffer_number setup to 10 + // "default" -> 100KB memtable, start flushing immediately + // "one" -> 200KB memtable, start flushing with two immutable memtables + // "two" -> 1MB memtable, start flushing with three immutable memtables + // "three" -> 90KB memtable, start flushing with four immutable memtables + default_cf.write_buffer_size = 100000; + default_cf.arena_block_size = 4 * 4096; + default_cf.max_write_buffer_number = 10; + default_cf.min_write_buffer_number_to_merge = 1; + default_cf.max_write_buffer_size_to_maintain = 0; + one.write_buffer_size = 200000; + one.arena_block_size = 4 * 4096; + one.max_write_buffer_number = 10; + one.min_write_buffer_number_to_merge = 2; + one.max_write_buffer_size_to_maintain = + static_cast(one.write_buffer_size); + two.write_buffer_size = 1000000; + two.arena_block_size = 4 * 4096; + two.max_write_buffer_number = 10; + two.min_write_buffer_number_to_merge = 3; + two.max_write_buffer_size_to_maintain = + static_cast(two.write_buffer_size); + three.write_buffer_size = 4096 * 22; + three.arena_block_size = 4096; + three.max_write_buffer_number = 10; + three.min_write_buffer_number_to_merge = 4; + three.max_write_buffer_size_to_maintain = + static_cast(three.write_buffer_size); + + Reopen({default_cf, one, two, three}); + + int micros_wait_for_flush = 10000; + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + AssertCountLiveLogFiles(1); + PutRandomData(1, 200, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + AssertCountLiveLogFiles(2); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 1, 0}); + AssertCountLiveLogFiles(3); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 0}); + AssertCountLiveLogFiles(4); + PutRandomData(3, 93, 990); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 1}); + AssertCountLiveLogFiles(5); + PutRandomData(3, 88, 990); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 2}); + AssertCountLiveLogFiles(6); + PutRandomData(3, 88, 990); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + AssertCountLiveLogFiles(7); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + AssertCountLiveLogFiles(8); + PutRandomData(2, 100, 10000); + WaitForFlush(2); + AssertNumberOfImmutableMemtables({0, 1, 0, 3}); + AssertCountLiveLogFiles(9); + PutRandomData(3, 88, 990); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + AssertCountLiveLogFiles(10); + PutRandomData(3, 88, 990); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 1}); + AssertCountLiveLogFiles(11); + PutRandomData(1, 200, 1000); + WaitForFlush(1); + AssertNumberOfImmutableMemtables({0, 0, 0, 1}); + AssertCountLiveLogFiles(5); + PutRandomData(3, 88 * 3, 990); + WaitForFlush(3); + PutRandomData(3, 88 * 4, 990); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + AssertCountLiveLogFiles(12); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + AssertCountLiveLogFiles(12); + PutRandomData(2, 3 * 1000, 1000); + WaitForFlush(2); + 
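+  // CF "two" was configured with min_write_buffer_number_to_merge == 3, so
+  // its three immutable memtables are merged into a single flush here,
+  // leaving none behind.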
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(1, 2 * 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(7);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+// The test is commented out because we want to test that a snapshot is not
+// created for memtables that do not support it, but there isn't a memtable
+// that doesn't support snapshots right now. If we have one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+//   db_options_.allow_concurrent_memtable_write = false;
+//   Open();
+//   auto* s1 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s1 != nullptr);
+//   dbfull()->ReleaseSnapshot(s1);
+//
+//   // Add a column family that doesn't support snapshot
+//   ColumnFamilyOptions first;
+//   first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+//   CreateColumnFamilies({"first"}, {first});
+//   auto* s2 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s2 == nullptr);
+//
+//   // Add a column family that supports snapshot. Snapshot stays not
+//   // supported.
+//   ColumnFamilyOptions second;
+//   CreateColumnFamilies({"second"}, {second});
+//   auto* s3 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s3 == nullptr);
+//   Close();
+// }
+// #endif  // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+  int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+              const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+    return 0;
+  }
+  const char* Name() const override { return "Test"; }
+  void FindShortestSeparator(
+      std::string* /*start*/,
+      const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+  Open();
+  // Add a column family with no comparator specified
+  CreateColumnFamilies({"first"});
+  const Comparator* comp = handles_[0]->GetComparator();
+  ASSERT_EQ(comp, BytewiseComparator());
+
+  // Add three column families - one with no comparator and two
+  // with comparators specified
+  ColumnFamilyOptions second, third, fourth;
+  second.comparator = &third_comparator;
+  third.comparator = &fourth_comparator;
+  CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+  ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+  ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+  ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+  Open();
+  CreateColumnFamilies({"first", "second"});
+  ColumnFamilyOptions default_cf, first, second;
+  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  second.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen({default_cf, first, second});
+
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  ASSERT_OK(Put(0, "foo", two));
+  ASSERT_OK(Put(0, "foo", one));
+  ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+  ASSERT_EQ(Get(0, "foo"), one);
+
+  ASSERT_OK(Put(1, "foo", two));
+  ASSERT_OK(Put(1, "foo", one));
+  ASSERT_OK(Merge(1, "foo", two));
+  ASSERT_EQ(Get(1, "foo"), three);
+
+  ASSERT_OK(Put(2, "foo", two));
+  ASSERT_OK(Put(2, "foo", one));
+  ASSERT_OK(Merge(2, "foo", two));
+  ASSERT_EQ(Get(2, "foo"), one + "," + two);
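+  // (The string-append merge operator joined the existing value and the
+  // operand with a ',' delimiter, hence one + "," + two above.)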
+ Close(); +} + +#ifndef ROCKSDB_LITE // WaitForFlush() is not supported +TEST_P(ColumnFamilyTest, DifferentCompactionStyles) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = static_cast(1) << 60; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(1, 10, 12000); + PutRandomData(1, 1, 10); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(std::to_string(i + 1), 2); + } + + // TRIGGER compaction "one" + PutRandomData(1, 10, 12000); + PutRandomData(1, 1, 10); + + // TRIGGER compaction "two" + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +// Sync points not supported in RocksDB Lite + +TEST_P(ColumnFamilyTest, MultipleManualCompactions) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + std::atomic_bool cf_1_1{true}; + 
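+  // LoadDependency takes {predecessor, successor} pairs: each successor
+  // sync point blocks until its predecessor has been reached, which is how
+  // the test interleaves the two manual compactions below.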
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"}, + {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"}, + {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4"); + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + std::vector threads; + threads.emplace_back([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(std::to_string(i + 1), 2); + } + threads.emplace_back([&] { + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1"); + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[2], nullptr, nullptr)); + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2"); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5"); + for (auto& t : threads) { + t.join(); + } + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + std::atomic_bool cf_1_1{true}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"}, + {"ColumnFamilyTest::AutoManual:2", 
"ColumnFamilyTest::AutoManual:5"}, + {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4"); + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1"); + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(std::to_string(i + 1), 2); + } + ROCKSDB_NAMESPACE::port::Thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[2], nullptr, nullptr)); + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2"); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5"); + threads.join(); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + std::atomic_bool cf_1_1{true}; + std::atomic_bool cf_1_2{true}; + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"}, + {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(std::to_string(i + 1), 2); + } + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + threads.join(); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + std::atomic_bool cf_1_1{true}; + std::atomic_bool cf_1_2{true}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"}, + {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"}, + 
{"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"}, + {"ColumnFamilyTest::ManualManual:1", + "ColumnFamilyTest::ManualManual:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4"); + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3"); + } else if (cf_1_2.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5"); + + WaitForFlush(1); + + // Add more L0 files and force another manual compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel( + std::to_string(one.level0_file_num_compaction_trigger + i), 1); + } + + ROCKSDB_NAMESPACE::port::Thread threads1([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1"); + + threads.join(); + threads1.join(); + WaitForCompaction(); + // VERIFY compaction "one" + ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); + + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + std::atomic_bool cf_1_1{true}; + std::atomic_bool cf_1_2{true}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"}, + 
{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + + WaitForFlush(1); + + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel( + std::to_string(one.level0_file_num_compaction_trigger + i), 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + threads.join(); + WaitForCompaction(); + // VERIFY compaction "one" + ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); + + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.max_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleLevel; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 3; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // SETUP column family "one" -- level style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(std::to_string(i + 1), 1); + } + std::atomic_bool cf_1_1{true}; + std::atomic_bool cf_1_2{true}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"}, + {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + 
"ColumnFamilyTest::ManualAuto:3"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (cf_1_1.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2.exchange(false)) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel( + std::to_string(one.level0_file_num_compaction_trigger + i), 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + threads.join(); + WaitForCompaction(); + // VERIFY compaction "one" + AssertFilesPerLevel("0,1", 1); + + // Compare against saved keys + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// In this test, we generate enough files to trigger automatic compactions. +// The automatic compaction waits in NonTrivial:AfterRun +// We generate more files and then trigger an automatic compaction +// This will wait because the automatic compaction has files it needs. +// Once the conflict is hit, the automatic compaction starts and ends +// Then the manual will run and end. 
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+       {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+       {"CompactionPicker::CompactRange:Conflict",
+        "ColumnFamilyTest::AutoManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+  // Add more L0 files, then force a manual compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.exclusive_manual_compaction = false;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    EXPECT_OK(iter->status());
+    result = "(invalid)";
+  }
+  return result;
+}
+}  // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+  // iter == 0 -- no tailing
+  // iter == 1 -- tailing
+  for 
(int iter = 0; iter < 2; ++iter) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "b", "a")); + ASSERT_OK(Put(2, "c", "m")); + ASSERT_OK(Put(2, "v", "t")); + std::vector iterators; + ReadOptions options; + options.tailing = (iter == 1); + ASSERT_OK(db_->NewIterators(options, handles_, &iterators)); + + for (auto it : iterators) { + it->SeekToFirst(); + } + ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "b->a"); + ASSERT_EQ(IterStatus(iterators[2]), "c->m"); + + ASSERT_OK(Put(1, "x", "x")); + + for (auto it : iterators) { + it->Next(); + } + + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + if (iter == 0) { + // no tailing + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + } else { + // tailing + ASSERT_EQ(IterStatus(iterators[1]), "x->x"); + } + ASSERT_EQ(IterStatus(iterators[2]), "v->t"); + + for (auto it : iterators) { + delete it; + } + Destroy(); + } +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported +TEST_P(ColumnFamilyTest, ReadOnlyDBTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "foo", "bla")); + ASSERT_OK(Put(2, "foo", "blabla")); + ASSERT_OK(Put(3, "foo", "blablabla")); + ASSERT_OK(Put(4, "foo", "blablablabla")); + + DropColumnFamilies({2}); + Close(); + // open only a subset of column families + AssertOpenReadOnly({"default", "one", "four"}); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + ASSERT_EQ("bla", Get(1, "foo")); + ASSERT_EQ("blablablabla", Get(2, "foo")); + + // test newiterators + { + std::vector iterators; + ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators)); + for (auto it : iterators) { + it->SeekToFirst(); + } + ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "foo->bla"); + ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla"); + for (auto it : iterators) { + it->Next(); + } + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + ASSERT_EQ(IterStatus(iterators[2]), "(invalid)"); + + for (auto it : iterators) { + delete it; + } + } + + Close(); + // can't open dropped column family + Status s = OpenReadOnly({"default", "one", "two"}); + ASSERT_TRUE(!s.ok()); + + // Can't open without specifying default column family + s = OpenReadOnly({"one", "four"}); + ASSERT_TRUE(!s.ok()); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite +TEST_P(ColumnFamilyTest, DontRollEmptyLogs) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + + for (size_t i = 0; i < handles_.size(); ++i) { + PutRandomData(static_cast(i), 10, 100); + } + int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); + // this will trigger the flushes + for (int i = 0; i <= 4; ++i) { + ASSERT_OK(Flush(i)); + } + + for (int i = 0; i < 4; ++i) { + WaitForFlush(i); + } + int total_new_writable_files = + env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start; + ASSERT_EQ(static_cast(total_new_writable_files), handles_.size() + 1); + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite +TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + default_cf.write_buffer_size = 100000; // small write buffer size + default_cf.arena_block_size = 4096; + 
default_cf.disable_auto_compactions = true; + one.disable_auto_compactions = true; + two.disable_auto_compactions = true; + db_options_.max_total_wal_size = 210000; + + Reopen({default_cf, one, two}); + + PutRandomData(2, 1, 10); // 10 bytes + for (int i = 0; i < 2; ++i) { + PutRandomData(0, 100, 1000); // flush + WaitForFlush(0); + + AssertCountLiveFiles(i + 1); + } + // third flush. now, CF [two] should be detected as stale and flushed + // column family 1 should not be flushed since it's empty + PutRandomData(0, 100, 1000); // flush + WaitForFlush(0); + WaitForFlush(2); + // at least 3 files for default column families, 1 file for column family + // [two], zero files for column family [one], because it's empty + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_GE(metadata.size(), 4); + bool has_cf1_sst = false; + bool has_cf2_sst = false; + for (const auto& file : metadata) { + if (file.column_family_name == "one") { + has_cf1_sst = true; + } else if (file.column_family_name == "two") { + has_cf2_sst = true; + } + } + ASSERT_FALSE(has_cf1_sst); + ASSERT_TRUE(has_cf2_sst); + + ASSERT_OK(Flush(0)); + ASSERT_EQ(0, dbfull()->TEST_total_log_size()); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) { + Status s = TryOpen({"one", "two"}); + ASSERT_TRUE(!s.ok()); + db_options_.create_missing_column_families = true; + s = TryOpen({"default", "one", "two"}); + ASSERT_TRUE(s.ok()); + Close(); +} + +TEST_P(ColumnFamilyTest, SanitizeOptions) { + DBOptions db_options; + for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) { + for (int l = 0; l <= 2; l++) { + for (int i = 1; i <= 3; i++) { + for (int j = 1; j <= 3; j++) { + for (int k = 1; k <= 3; k++) { + ColumnFamilyOptions original; + original.compaction_style = static_cast(s); + original.num_levels = l; + original.level0_stop_writes_trigger = i; + original.level0_slowdown_writes_trigger = j; + original.level0_file_num_compaction_trigger = k; + original.write_buffer_size = + l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k; + + ColumnFamilyOptions result = + SanitizeOptions(ImmutableDBOptions(db_options), original); + ASSERT_TRUE(result.level0_stop_writes_trigger >= + result.level0_slowdown_writes_trigger); + ASSERT_TRUE(result.level0_slowdown_writes_trigger >= + result.level0_file_num_compaction_trigger); + ASSERT_TRUE(result.level0_file_num_compaction_trigger == + original.level0_file_num_compaction_trigger); + if (s == kCompactionStyleLevel) { + ASSERT_GE(result.num_levels, 2); + } else { + ASSERT_GE(result.num_levels, 1); + if (original.num_levels >= 1) { + ASSERT_EQ(result.num_levels, original.num_levels); + } + } + + // Make sure Sanitize options sets arena_block_size to 1/8 of + // the write_buffer_size, rounded up to a multiple of 4k. 
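+            // Worked example with the loop values l = 2, i = 1, j = 1, k = 1:
+            // write_buffer_size = 2 * 4MB + 1MB + 1KB + 1. An eighth of the
+            // MB part is 1MB + 128KB = 1179648; the 1KB + 1 remainder means
+            // it is not a multiple of 4KB, so 4096 is added, and the result
+            // is finally capped at the 1MB maximum:
+            // min(1048576, 1183744) = 1048576.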
+ size_t expected_arena_block_size = + l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8; + if (j + k != 0) { + // not a multiple of 4k, round up 4k + expected_arena_block_size += 4 * 1024; + } + expected_arena_block_size = + std::min(size_t{1024 * 1024}, expected_arena_block_size); + ASSERT_EQ(expected_arena_block_size, result.arena_block_size); + } + } + } + } + } +} + +TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) { + // iter 0 -- drop CF, don't reopen + // iter 1 -- delete CF, reopen + for (int iter = 0; iter < 2; ++iter) { + db_options_.create_missing_column_families = true; + db_options_.max_open_files = 20; + // delete obsolete files always + db_options_.delete_obsolete_files_period_micros = 0; + Open({"default", "one", "two"}); + ColumnFamilyOptions options; + options.level0_file_num_compaction_trigger = 100; + options.level0_slowdown_writes_trigger = 200; + options.level0_stop_writes_trigger = 200; + options.write_buffer_size = 100000; // small write buffer size + Reopen({options, options, options}); + + // 1MB should create ~10 files for each CF + int kKeysNum = 10000; + PutRandomData(0, kKeysNum, 100); + PutRandomData(1, kKeysNum, 100); + PutRandomData(2, kKeysNum, 100); + + { + std::unique_ptr iterator( + db_->NewIterator(ReadOptions(), handles_[2])); + iterator->SeekToFirst(); + + if (iter == 0) { + // Drop CF two + ASSERT_OK(db_->DropColumnFamily(handles_[2])); + } else { + // delete CF two + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2])); + handles_[2] = nullptr; + } + // Make sure iterator created can still be used. + int count = 0; + for (; iterator->Valid(); iterator->Next()) { + ASSERT_OK(iterator->status()); + ++count; + } + ASSERT_OK(iterator->status()); + ASSERT_EQ(count, kKeysNum); + } + + // Add bunch more data to other CFs + PutRandomData(0, kKeysNum, 100); + PutRandomData(1, kKeysNum, 100); + + if (iter == 1) { + Reopen(); + } + + // Since we didn't delete CF handle, RocksDB's contract guarantees that + // we're still able to read dropped CF + for (int i = 0; i < 3; ++i) { + std::unique_ptr iterator( + db_->NewIterator(ReadOptions(), handles_[i])); + int count = 0; + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_OK(iterator->status()); + ++count; + } + ASSERT_OK(iterator->status()); + ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2)); + } + + Close(); + Destroy(); + } +} + +TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) { + db_options_.create_missing_column_families = true; + db_options_.max_open_files = 20; + // delete obsolete files always + db_options_.delete_obsolete_files_period_micros = 0; + Open({"default", "one", "two"}); + ColumnFamilyOptions options; + options.level0_file_num_compaction_trigger = 100; + options.level0_slowdown_writes_trigger = 200; + options.level0_stop_writes_trigger = 200; + options.write_buffer_size = 100000; // small write buffer size + Reopen({options, options, options}); + + // 1MB should create ~10 files for each CF + int kKeysNum = 10000; + PutRandomData(1, kKeysNum, 100); + { + std::unique_ptr iterator( + db_->NewIterator(ReadOptions(), handles_[1])); + iterator->SeekToFirst(); + + DropColumnFamilies({1}); + + // Make sure iterator created can still be used. 
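+      // The iterator pins the current SuperVersion, i.e. the memtables and
+      // SST files of the dropped CF, so reads through it keep working until
+      // the iterator itself is released.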
+ int count = 0; + for (; iterator->Valid(); iterator->Next()) { + ASSERT_OK(iterator->status()); + ++count; + } + ASSERT_OK(iterator->status()); + ASSERT_EQ(count, kKeysNum); + } + + Reopen(); + Close(); + Destroy(); +} + +TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) { + db_options_.create_missing_column_families = true; + Open({"default", "one"}); + ColumnFamilyOptions options; + options.level0_file_num_compaction_trigger = 100; + options.level0_slowdown_writes_trigger = 200; + options.level0_stop_writes_trigger = 200; + options.max_write_buffer_number = 20; + options.write_buffer_size = 100000; // small write buffer size + Reopen({options, options}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply::ColumnFamilyDrop:0", + "FlushJob::WriteLevel0Table"}, + {"VersionSet::LogAndApply::ColumnFamilyDrop:1", + "FlushJob::InstallResults"}, + {"FlushJob::InstallResults", + "VersionSet::LogAndApply::ColumnFamilyDrop:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + test::SleepingBackgroundTask sleeping_task; + + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); + + // 1MB should create ~10 files for each CF + int kKeysNum = 10000; + PutRandomData(1, kKeysNum, 100); + + std::vector threads; + threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); }); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + sleeping_task.Reset(); + // now we sleep again. this is just so we're certain that flush job finished + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. 
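+  // (The new sleeping task is queued behind the flush job on the
+  // HIGH-priority pool, so once it reports sleeping the flush must have
+  // drained; the wake/wait pair below then simply retires it.)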
+  sleeping_task.WaitUntilSleeping();
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+
+  {
+    // Since we didn't delete CF handle, RocksDB's contract guarantees that
+    // we're still able to read dropped CF
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  Close();
+  Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+// skipped as persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+                            std::vector<Comparator*>* comparators) {
+  while (test_stage < kMainThreadStartPersistingOptionsFile &&
+         !ordered_by_writethread) {
+    Env::Default()->SleepForMicroseconds(100);
+  }
+  cf_test->DropColumnFamilies({cf_id});
+  if ((*comparators)[cf_id]) {
+    delete (*comparators)[cf_id];
+    (*comparators)[cf_id] = nullptr;
+  }
+  test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+}  // anonymous namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+  const int kCfCount = 5;
+  std::vector<ColumnFamilyOptions> cf_opts;
+  std::vector<Comparator*> comparators;
+  for (int i = 0; i < kCfCount; ++i) {
+    cf_opts.emplace_back();
+    comparators.push_back(new test::SimpleSuffixReverseComparator());
+    cf_opts.back().comparator = comparators.back();
+  }
+  db_options_.create_if_missing = true;
+  db_options_.create_missing_column_families = true;
+
+  auto main_thread_id = std::this_thread::get_id();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+        auto current_thread_id = std::this_thread::get_id();
+        // If it's the main thread hitting this sync-point, then it
+        // will be blocked until some other thread updates the test_stage.
+        if (main_thread_id == current_thread_id) {
+          test_stage = kMainThreadStartPersistingOptionsFile;
+          while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+                 !ordered_by_writethread) {
+            Env::Default()->SleepForMicroseconds(100);
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+        // This means a thread doing DropColumnFamily() is waiting for
+        // another thread to finish persisting options.
+        // In such case, we update the test_stage to unblock the main thread.
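+        // Without this escape hatch the two threads could deadlock: the main
+        // thread would spin in the callback above waiting for the drop to
+        // finish, while the drop itself is queued behind the main thread's
+        // options-file write.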
+ ordered_by_writethread = true; + }); + + // Create a database with four column families + Open({"default", "one", "two", "three"}, + {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Start a thread that will drop the first column family + // and its comparator + ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this, + 1, &comparators); + + DropColumnFamilies({2}); + + drop_cf_thread.join(); + Close(); + Destroy(); + for (auto* comparator : comparators) { + if (comparator) { + delete comparator; + } + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE + +TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { + const uint64_t kBaseRate = 800000u; + db_options_.delayed_write_rate = kBaseRate; + db_options_.max_background_compactions = 6; + + Open({"default"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + MutableCFOptions mutable_cf_options(column_family_options_); + + mutable_cf_options.level0_slowdown_writes_trigger = 20; + mutable_cf_options.level0_stop_writes_trigger = 10000; + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + mutable_cf_options.disable_auto_compactions = false; + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(201); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(400); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(450); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(205); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(202); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(201); + 
RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(198); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(399); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(599); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(2001); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(3001); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(390); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(100); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->set_l0_delay_trigger_count(100); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(101); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->set_l0_delay_trigger_count(0); + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage->set_l0_delay_trigger_count(101); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(200); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage->set_l0_delay_trigger_count(0); + vstorage->TEST_set_estimated_compaction_needed_bytes(0); + 
RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + mutable_cf_options.disable_auto_compactions = true; + dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->set_l0_delay_trigger_count(50); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(0, GetDbDelayedWriteRate()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(60); + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(0, GetDbDelayedWriteRate()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + mutable_cf_options.disable_auto_compactions = false; + vstorage->set_l0_delay_trigger_count(70); + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->set_l0_delay_trigger_count(71); + vstorage->TEST_set_estimated_compaction_needed_bytes(501); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); +} + +TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { + db_options_.max_background_compactions = 6; + Open({"default"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + MutableCFOptions mutable_cf_options(column_family_options_); + + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 50; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(45); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + RecalculateWriteStallConditions(cfd, 
mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(6); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 16; + mutable_cf_options.level0_stop_writes_trigger = 30; + + vstorage->set_l0_delay_trigger_count(5); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(3); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); +} + +TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) { + const uint64_t kBaseRate = 810000u; + db_options_.delayed_write_rate = kBaseRate; + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + ColumnFamilyData* cfd1 = + static_cast(handles_[1])->cfd(); + VersionStorageInfo* vstorage1 = cfd1->current()->storage_info(); + + MutableCFOptions mutable_cf_options(column_family_options_); + mutable_cf_options.level0_slowdown_writes_trigger = 20; + mutable_cf_options.level0_stop_writes_trigger = 10000; + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + MutableCFOptions mutable_cf_options1 = mutable_cf_options; + mutable_cf_options1.soft_pending_compaction_bytes_limit = 500; + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(201); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(70); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(800); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(700); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); 
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_TRUE(!IsDbWriteStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); +} + +TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { + db_options_.max_background_compactions = 6; + column_family_options_.soft_pending_compaction_bytes_limit = 200; + column_family_options_.hard_pending_compaction_bytes_limit = 2000; + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + ColumnFamilyData* cfd1 = + static_cast(handles_[1])->cfd(); + VersionStorageInfo* vstorage1 = cfd1->current()->storage_info(); + + MutableCFOptions mutable_cf_options(column_family_options_); + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 30; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + MutableCFOptions mutable_cf_options1 = mutable_cf_options; + mutable_cf_options1.level0_slowdown_writes_trigger = 16; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(60); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(30); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(70); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(20); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(3); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage1->set_l0_delay_trigger_count(2); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(0); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); +} + +TEST_P(ColumnFamilyTest, CreateAndDestroyOptions) { 
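+  // (ColumnFamilyOptions are copied by CreateColumnFamily, so the unique_ptr
+  // below can be released immediately after the call; that contract is what
+  // this test pins down.)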
+ std::unique_ptr cfo(new ColumnFamilyOptions()); + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh)); + cfo.reset(); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + +TEST_P(ColumnFamilyTest, CreateDropAndDestroy) { + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh)); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + +#ifndef ROCKSDB_LITE +TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) { + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh)); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + +TEST_P(ColumnFamilyTest, FlushCloseWALFiles) { + SpecialEnv env(Env::Default()); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Block flush jobs from running + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + TEST_SYNC_POINT("FlushCloseWALFiles:0"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // WaitForFlush() is not supported +TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) { + SpecialEnv env(Env::Default()); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(it->status()); + // A flush will make `it` hold the last reference of its super version. + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + // Flush jobs will close previous WAL files after finishing. 
By + // block flush jobs from running, we trigger a condition where + // the iterator destructor should close the WAL files. + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(1, env.num_open_wal_file_.load()); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + WaitForFlush(1); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} + +TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. + env.SetBackgroundThreads(2, Env::HIGH); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + ReadOptions ro; + ro.background_purge_on_iterator_cleanup = true; + Iterator* it = db_->NewIterator(ro, handles_[1]); + ASSERT_OK(it->status()); + // A flush will make `it` hold the last reference of its super version. + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"ColumnFamilyTest::IteratorCloseWALFile2:0", + "DBImpl::BGWorkPurge:start"}, + {"ColumnFamilyTest::IteratorCloseWALFile2:2", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(2, env.num_open_wal_file_.load()); + + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // TEST functions are not supported in lite +TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. 
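+  // (With background_purge_on_iterator_cleanup set on the read options
+  // below, releasing the last SuperVersion reference does not delete
+  // obsolete files inline; a separate purge job on the HIGH pool does it,
+  // hence the second thread.)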
+ env.SetBackgroundThreads(2, Env::HIGH); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(3)); + column_family_options_.level0_file_num_compaction_trigger = 2; + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodar2", "mirko")); + ASSERT_OK(Flush(1)); + + // Create an iterator holding the current super version, as well as + // the SST file just flushed. + ReadOptions ro; + ro.tailing = true; + ro.background_purge_on_iterator_cleanup = true; + Iterator* it = db_->NewIterator(ro, handles_[1]); + // A flush will make `it` hold the last reference of its super version. + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodar2", "mirko")); + ASSERT_OK(Flush(1)); + + WaitForCompaction(); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"ColumnFamilyTest::IteratorCloseWALFile2:0", + "DBImpl::BGWorkPurge:start"}, + {"ColumnFamilyTest::IteratorCloseWALFile2:2", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + env.delete_count_.store(0); + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + it->Seek(""); + ASSERT_OK(it->status()); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + ASSERT_EQ(0, env.delete_count_.load()); + + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ASSERT_EQ(1, env.delete_count_.load()); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ASSERT_EQ(1, env.delete_count_.load()); + + delete it; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +// Disable on windows because SyncWAL requires env->IsSyncThreadSafe() +// to return true which is not so in unbuffered mode. 
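+// (LogSyncConflictFlush below exercises the race between SyncWAL() marking
+// WAL files as synced and a concurrent flush switching to a new WAL.)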
+#ifndef OS_WIN +TEST_P(ColumnFamilyTest, LogSyncConflictFlush) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + + ASSERT_OK(Put(0, "", "")); + ASSERT_OK(Put(1, "foo", "bar")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1", + "ColumnFamilyTest::LogSyncConflictFlush:1"}, + {"ColumnFamilyTest::LogSyncConflictFlush:2", + "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); }); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); + + thread.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Close(); +} +#endif + +// this test is placed here, because the infrastructure for Column Family +// test is being used to ensure a roll of wal files. +// Basic idea is to test that WAL truncation is being detected and not +// ignored +TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + + Build(0, 100); + + // Flush the 0th column family to force a roll of the wal log + ASSERT_OK(Flush(0)); + + // Add some more entries + Build(100, 100); + + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + + // collect wal files + std::vector logfs; + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (!(ParseFileName(filenames[i], &number, &type))) continue; + + if (type != kWalFile) continue; + + logfs.push_back(filenames[i]); + } + + std::sort(logfs.begin(), logfs.end()); + ASSERT_GE(logfs.size(), 2); + + // Take the last but one file, and truncate it + std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2]; + std::vector names_save = names_; + + uint64_t fsize; + ASSERT_OK(env_->GetFileSize(fpath, &fsize)); + ASSERT_GT(fsize, 0); + + Close(); + + std::string backup_logs = dbname_ + "/backup_logs"; + std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2]; + + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); + // Not sure how easy it is to make this data driven. + // need to read back the WAL file and truncate last 10 + // entries + CopyFile(fpath, t_fpath, fsize - 9180); + + ASSERT_OK(env_->DeleteFile(fpath)); + ASSERT_OK(env_->RenameFile(t_fpath, fpath)); + + db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + OpenReadOnly(names_save); + + CheckMissed(); + + Close(); + + Open(names_save); + + CheckMissed(); + + Close(); + + // cleanup + ASSERT_OK(env_->DeleteDir(backup_logs)); +} + +TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { + Open(); + // Leave cf_paths for one column families to be empty. + // Files should be generated according to db_paths for that + // column family. + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1", + std::numeric_limits::max()); + CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2}); + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + + // Fill Column family 1. 
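+  // (The explicitly flushed SST should land under cf_opt1.cf_paths[0] rather
+  // than under dbname_, as asserted right below.)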
+ PutRandomData(1, 100, 100); + ASSERT_OK(Flush(1)); + + ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Fill column family 2 + PutRandomData(2, 100, 100); + ASSERT_OK(Flush(2)); + + // SST from Column family 2 should be generated in + // db_paths which is dbname_ in this case. + ASSERT_EQ(1, GetSstFileCount(dbname_)); +} + +TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { + Open(); + // Configure Column family specific paths. + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1", + std::numeric_limits::max()); + cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1", + std::numeric_limits::max()); + CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2}); + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + + PutRandomData(1, 100, 100, true /* save */); + ASSERT_OK(Flush(1)); + + // Check that files are generated in appropriate paths. + ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + PutRandomData(2, 100, 100, true /* save */); + ASSERT_OK(Flush(2)); + + ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Re-open and verify the keys. + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + DBImpl* dbi = static_cast_with_check(db_); + for (int cf = 1; cf != 3; ++cf) { + ReadOptions read_options; + read_options.readahead_size = 0; + auto it = dbi->NewIterator(read_options, handles_[cf]); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_OK(it->status()); + Slice key(it->key()); + ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString())); + } + ASSERT_OK(it->status()); + delete it; + + for (const auto& key : keys_[cf]) { + ASSERT_NE("NOT_FOUND", Get(cf, key)); + } + } +} + +TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_age_cutoff = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_age_cutoff = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + +TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_force_threshold = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_force_threshold = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + 
                  .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
+  DBOptions db_options;
+
+  ColumnFamilyOptions cf_options;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.memtable_protection_bytes_per_key = 5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsNotSupported());
+
+  cf_options.memtable_protection_bytes_per_key = 1;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.memtable_protection_bytes_per_key = 16;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsNotSupported());
+
+  cf_options.memtable_protection_bytes_per_key = 0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..ef38946f7
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+  CompactFilesTest() {
+    env_ = Env::Default();
+    db_name_ = test::PerThreadDBPath("compact_files_test");
+  }
+
+  std::string db_name_;
+  Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+  void ClearFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+  Options options;
+  // to trigger compaction more easily
+  const int kWriteBufferSize = 10000;
+  const int kLevel0Trigger = 2;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  // Small slowdown and stop trigger for experimental purpose.
+  options.level0_slowdown_writes_trigger = 20;
+  options.level0_stop_writes_trigger = 20;
+  options.write_buffer_size = kWriteBufferSize;
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  options.compression = kNoCompression;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+      {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // create a couple of files
+  // Background compaction starts and waits in BackgroundCallCompaction:0
+  for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(100 - i), ""));
+    ASSERT_OK(db->Flush(FlushOptions()));
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::string file1;
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    if (file1 == "") {
+      file1 = file.db_path + "/" + file.name;
+    } else {
+      std::string file2 = file.db_path + "/" + file.name;
+      // Another thread starts a compact files and creates an L0 compaction
+      // The background compaction then notices that there is an L0 compaction
+      // already in progress and doesn't do an L0 compaction
+      // Once the background compaction finishes, the compact files finishes
+      ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                 {file1, file2}, 0));
+      break;
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  delete db;
+}
+
+TEST_F(CompactFilesTest, MultipleLevel) {
+  Options options;
+  options.create_if_missing = true;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 6;
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // create a couple of files in L0, L3, L4 and L5
+  for (int i = 5; i > 2; --i) {
+    collector->ClearFlushedFiles();
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
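+    // (OnFlushCompleted fires on a background thread, so reading the
+    // collector without this wait could race with a late callback.)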
+ ASSERT_OK(static_cast_with_check(db)->TEST_WaitForBackgroundWork()); + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); + + std::string prop; + ASSERT_TRUE(db->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(i), &prop)); + ASSERT_EQ("1", prop); + } + ASSERT_OK(db->Put(WriteOptions(), std::to_string(0), "")); + ASSERT_OK(db->Flush(FlushOptions())); + + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + // Compact files except the file in L3 + std::vector files; + for (int i = 0; i < 6; ++i) { + if (i == 3) continue; + for (auto& file : meta.levels[i].files) { + files.push_back(file.db_path + "/" + file.name); + } + } + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"}, + {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0"); + ASSERT_OK(db->Put(WriteOptions(), "bar", "v2")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1"); + }); + + // Compaction cannot move up the data to higher level + // here we have input file from level 5, so the output level has to be >= 5 + for (int invalid_output_level = 0; invalid_output_level < 5; + invalid_output_level++) { + s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); + std::cout << s.ToString() << std::endl; + ASSERT_TRUE(s.IsInvalidArgument()); + } + + ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); + SyncPoint::GetInstance()->DisableProcessing(); + thread.join(); + + delete db; +} + +TEST_F(CompactFilesTest, ObsoleteFiles) { + Options options; + // to trigger compaction more easily + const int kWriteBufferSize = 65536; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.write_buffer_size = kWriteBufferSize; + options.max_write_buffer_number = 2; + options.compression = kNoCompression; + + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + ASSERT_OK(DestroyDB(db_name_, options)); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + + // create couple files + for (int i = 1000; i < 2000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), + std::string(kWriteBufferSize / 10, 'a' + (i % 26)))); + } + + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); + + // verify all compaction input files are deleted + for (auto fname : l0_files) { + ASSERT_EQ(Status::NotFound(), env_->FileExists(fname)); + } + delete db; +} + +TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. 
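+  // (Context for the tiny max_compaction_bytes below: CompactFiles() with
+  // output level 0 is expected to keep a single output file rather than cut
+  // it, so the two calls at the end should succeed without tripping any
+  // internal assertion.)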
+ options.compaction_style = kCompactionStyleNone; + options.level0_slowdown_writes_trigger = 1000; + options.level0_stop_writes_trigger = 1000; + options.write_buffer_size = 65536; + options.max_write_buffer_number = 2; + options.compression = kNoCompression; + options.max_compaction_bytes = 5000; + + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + ASSERT_OK(DestroyDB(db_name_, options)); + Status s = DB::Open(options, db_name_, &db); + assert(s.ok()); + assert(db); + + // create couple files + for (int i = 0; i < 500; ++i) { + ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), + std::string(1000, 'a' + (i % 26)))); + } + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); + auto l0_files_1 = collector->GetFlushedFiles(); + collector->ClearFlushedFiles(); + for (int i = 0; i < 500; ++i) { + ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), + std::string(1000, 'a' + (i % 26)))); + } + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); + auto l0_files_2 = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); + // no assertion failure + delete db; +} + +TEST_F(CompactFilesTest, CapturingPendingFiles) { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + // Always do full scans for obsolete files (needed to reproduce the issue). + options.delete_obsolete_files_period_micros = 0; + + // Add listener. + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + ASSERT_OK(DestroyDB(db_name_, options)); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + assert(db); + + // Create 5 files. + for (int i = 0; i < 5; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "key" + std::to_string(i), "value")); + ASSERT_OK(db->Flush(FlushOptions())); + } + + // Ensure background work is fully finished including listener callbacks + // before accessing listener state. + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForBackgroundWork()); + auto l0_files = collector->GetFlushedFiles(); + EXPECT_EQ(5, l0_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"}, + {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Start compacting files. + ROCKSDB_NAMESPACE::port::Thread compaction_thread( + [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); }); + + // In the meantime flush another file. + TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); + ASSERT_OK(db->Put(WriteOptions(), "key5", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); + + compaction_thread.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + delete db; + + // Make sure we can reopen the DB. 
+  s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  assert(db);
+  delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+  class FilterWithGet : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+                std::string* /*new_value*/,
+                bool* /*value_changed*/) const override {
+      if (db_ == nullptr) {
+        return true;
+      }
+      std::string res;
+      db_->Get(ReadOptions(), "", &res);
+      return true;
+    }
+
+    void SetDB(DB* db) { db_ = db; }
+
+    const char* Name() const override { return "FilterWithGet"; }
+
+   private:
+    DB* db_;
+  };
+
+  std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_filter = cf.get();
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+
+  cf->SetDB(db);
+
+  // Write one L0 file
+  ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Compact all L0 files using CompactFiles
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    std::string fname = file.db_path + "/" + file.name;
+    ASSERT_OK(
+        db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+  }
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+  if (!Zlib_Supported()) {
+    fprintf(stderr, "zlib compression not supported, skip this test\n");
+    return;
+  }
+  if (!Snappy_Supported()) {
+    fprintf(stderr, "snappy compression not supported, skip this test\n");
+    return;
+  }
+  // Check that passing `CompressionType::kDisableCompressionOption` to
+  // `CompactFiles` causes it to use the column family compression options.
+  for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+                                CompactionStyle::kCompactionStyleUniversal,
+                                CompactionStyle::kCompactionStyleNone}) {
+    ASSERT_OK(DestroyDB(db_name_, Options()));
+    Options options;
+    options.compaction_style = compaction_style;
+    // L0: Snappy, L1: Zlib, L2: Snappy
+    options.compression_per_level = {CompressionType::kSnappyCompression,
+                                     CompressionType::kZlibCompression,
+                                     CompressionType::kSnappyCompression};
+    options.create_if_missing = true;
+    FlushedFileCollector* collector = new FlushedFileCollector();
+    options.listeners.emplace_back(collector);
+    DB* db = nullptr;
+    ASSERT_OK(DB::Open(options, db_name_, &db));
+
+    ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    auto l0_files = collector->GetFlushedFiles();
+    ASSERT_EQ(1, l0_files.size());
+
+    // L0->L1 compaction, so output should be Zlib-compressed
+    CompactionOptions compaction_opts;
+    compaction_opts.compression = CompressionType::kDisableCompressionOption;
+    ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+    ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+    ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+    for (const auto& name_and_table_props : all_tables_props) {
+      ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+                name_and_table_props.second->compression_name);
+    }
+    delete db;
+  }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 1000;
+  options.level0_stop_writes_trigger = 1000;
+  options.write_buffer_size = 65536;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.max_compaction_bytes = 5000;
+
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  assert(db);
+
+  // create a couple of files
+  for (int i = 0; i < 500; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+                      std::string(1000, 'a' + (i % 26))));
+  }
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  auto l0_files_1 = collector->GetFlushedFiles();
+  CompactionOptions co;
+  co.compression = CompressionType::kLZ4Compression;
+  CompactionJobInfo compaction_job_info{};
+  ASSERT_OK(
+      db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+  ASSERT_EQ(compaction_job_info.base_input_level, 0);
+  ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+  ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+  ASSERT_EQ(compaction_job_info.compaction_reason,
+            CompactionReason::kManualCompaction);
+  ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+  ASSERT_EQ(compaction_job_info.output_level, 0);
+  ASSERT_OK(compaction_job_info.status);
+  // no assertion failure
+  delete db;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h
new file mode 100644
index 000000000..1ed465c2c
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator.h
@@ -0,0 +1,276 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that
+// result; otherwise, it performs the necessary key comparisons itself. Both
+// bounds are optional.
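+//
+// Usage sketch (illustrative only; `it` stands for some InternalIterator*,
+// and the tested examples live in clipping_iterator_test.cc):
+//
+//   Slice start("key1");
+//   Slice end("key4");  // exclusive upper bound
+//   ClippingIterator clip(it, &start, &end, BytewiseComparator());
+//   for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
+//     // clip.key() is guaranteed to lie in [start, end)
+//   }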
+class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const CompareInterface* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + 
void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+    if (bound_check_result == IterBoundCheck::kInbound) {
+      return;
+    }
+
+    if (bound_check_result == IterBoundCheck::kOutOfBound) {
+      valid_ = false;
+      return;
+    }
+
+    assert(bound_check_result == IterBoundCheck::kUnknown);
+
+    if (cmp_->Compare(key(), *end_) >= 0) {
+      valid_ = false;
+    }
+  }
+
+  // Invalidates the iterator if the current key is at or past the exclusive
+  // upper bound; reuses the wrapped iterator's own bounds check result when
+  // it is known.
+  void EnforceUpperBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!end_) {
+      return;
+    }
+
+    EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+  }
+
+  // Invalidates the iterator if the current key is below the lower bound.
+  void EnforceLowerBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!start_) {
+      return;
+    }
+
+    if (!iter_->MayBeOutOfLowerBound()) {
+      return;
+    }
+
+    if (cmp_->Compare(key(), *start_) < 0) {
+      valid_ = false;
+    }
+  }
+
+  void AssertBounds() {
+    assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+    assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+  }
+
+  void UpdateAndEnforceBounds() {
+    UpdateValid();
+    EnforceUpperBound();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceUpperBound() {
+    UpdateValid();
+    EnforceUpperBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceLowerBound() {
+    UpdateValid();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  InternalIterator* iter_;
+  const Slice* start_;
+  const Slice* end_;
+  const CompareInterface* cmp_;
+  bool valid_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc
new file mode 100644
index 000000000..b2b167048
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/clipping_iterator.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A vector iterator which does its own bounds checking. This is for testing
+// the optimizations in the clipping iterator where we bypass the bounds
+// checking if the input iterator has already performed it.
+class BoundsCheckingVectorIterator : public VectorIterator {
+ public:
+  BoundsCheckingVectorIterator(const std::vector<std::string>& keys,
+                               const std::vector<std::string>& values,
+                               const Slice* start, const Slice* end,
+                               const Comparator* cmp)
+      : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) {
+    assert(cmp_);
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(Valid());
+    assert(result);
+
+    Next();
+
+    if (!Valid()) {
+      return false;
+    }
+
+    result->key = key();
+    result->bound_check_result = UpperBoundCheckResult();
+    result->value_prepared = true;
+
+    return true;
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+
+    if (!start_) {
+      return false;
+    }
+
+    return cmp_->Compare(key(), *start_) < 0;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+
+    if (!end_) {
+      return IterBoundCheck::kInbound;
+    }
+
+    return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound
+                                            : IterBoundCheck::kInbound;
+  }
+
+ private:
+  const Slice* start_;
+  const Slice* end_;
+  const Comparator* cmp_;
+};
+
+class ClippingIteratorTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {};
+
+TEST_P(ClippingIteratorTest, Clip) {
+  const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4",
+                                      "key5", "key6", "key7", "key8", "key9"};
+  const std::vector<std::string> values{
+      "unused0", "value1", "value2", "value3", "unused4",
+      "unused5", "unused6", "unused7", "unused8", "unused9"};
+
+  assert(keys.size() == values.size());
+
+  // Note: the input always contains key1, key2, and key3; however, the
+  // clipping window is based on the test parameters: its left edge is a value
+  // in the range [0, 4], and its size is a value in the range [0, 5]
+  const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]};
+  const std::vector<std::string> input_values{values[1], values[2], values[3]};
+
+  const bool use_bounds_checking_vec_it = std::get<0>(GetParam());
+
+  const size_t clip_start_idx = std::get<1>(GetParam());
+  const size_t clip_window_size = std::get<2>(GetParam());
+  const size_t clip_end_idx = clip_start_idx + clip_window_size;
+
+  const Slice start(keys[clip_start_idx]);
+  const Slice end(keys[clip_end_idx]);
+
+  std::unique_ptr<InternalIterator> input(
+      use_bounds_checking_vec_it
+          ? new BoundsCheckingVectorIterator(input_keys, input_values, &start,
+                                             &end, BytewiseComparator())
+          : new VectorIterator(input_keys, input_values, BytewiseComparator()));
+
+  ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+
+  // The range the clipping iterator should return values from. This is
+  // essentially the intersection of the input range [1, 4) and the clipping
+  // window [clip_start_idx, clip_end_idx)
+  const size_t data_start_idx =
+      std::max(clip_start_idx, static_cast<size_t>(1));
+  const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4));
+
+  // Range is empty; all Seeks should fail
+  if (data_start_idx >= data_end_idx) {
+    clip.SeekToFirst();
+    ASSERT_FALSE(clip.Valid());
+
+    clip.SeekToLast();
+    ASSERT_FALSE(clip.Valid());
+
+    for (size_t i = 0; i < keys.size(); ++i) {
+      clip.Seek(keys[i]);
+      ASSERT_FALSE(clip.Valid());
+
+      clip.SeekForPrev(keys[i]);
+      ASSERT_FALSE(clip.Valid());
+    }
+
+    return;
+  }
+
+  // Range is non-empty; call SeekToFirst and iterate forward
+  clip.SeekToFirst();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_start_idx]);
+  ASSERT_EQ(clip.value(), values[data_start_idx]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+    clip.Next();
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  clip.Next();
+  ASSERT_FALSE(clip.Valid());
+
+  // Do it again using NextAndGetResult
+  clip.SeekToFirst();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_start_idx]);
+  ASSERT_EQ(clip.value(), values[data_start_idx]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+    IterateResult result;
+    ASSERT_TRUE(clip.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[i]);
+    ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  IterateResult result;
+  ASSERT_FALSE(clip.NextAndGetResult(&result));
+  ASSERT_FALSE(clip.Valid());
+
+  // Call SeekToLast and iterate backward
+  clip.SeekToLast();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+  ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) {
+    clip.Prev();
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  clip.Prev();
+  ASSERT_FALSE(clip.Valid());
+
+  // Call Seek/SeekForPrev for all keys; Seek should return the smallest key
+  // which is >= the target; SeekForPrev should return the largest key which is
+  // <= the target
+  for (size_t i = 0; i < keys.size(); ++i) {
+    clip.Seek(keys[i]);
+
+    if (i < data_start_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[data_start_idx]);
+      ASSERT_EQ(clip.value(), values[data_start_idx]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else if (i < data_end_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[i]);
+      ASSERT_EQ(clip.value(), values[i]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else {
+      ASSERT_FALSE(clip.Valid());
+    }
+
+    clip.SeekForPrev(keys[i]);
+
+    if (i < data_start_idx) {
+      ASSERT_FALSE(clip.Valid());
+    } else if (i < data_end_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[i]);
+      ASSERT_EQ(clip.value(), values[i]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+      ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    ClippingIteratorTest, ClippingIteratorTest,
+    ::testing::Combine(
+        ::testing::Bool(),
+        ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)),
+        ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6))));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..a32b529f7
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,855 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
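+//
+// This file implements the Compaction class declared in
+// db/compaction/compaction.h, together with small helpers used across the
+// compaction code: sstableKeyCompare() (boundary-key ordering that sorts
+// range tombstone sentinels before other keys with the same user key) and
+// TotalFileSize().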
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b) {
+  auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->fd.GetFileSize();
+  }
+  return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+    Slice* largest_user_key, int exclude_level) {
+  bool initialized = false;
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+      continue;
+    }
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i].level == 0 || inputs[i].files.empty()) {
+      continue;
+    }
+    inputs[i].atomic_compaction_unit_boundaries.reserve(
+        inputs[i].files.size());
+    AtomicCompactionUnitBoundary cur_boundary;
+    size_t first_atomic_idx = 0;
+    auto add_unit_boundary = [&](size_t to) {
+      if (first_atomic_idx == to) return;
+      for (size_t k = first_atomic_idx; k < to; k++) {
+        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+      }
+      first_atomic_idx = to;
+    };
+    for (size_t j = 0; j < inputs[i].files.size(); j++) {
+      const auto* f = inputs[i].files[j];
+      if (j == 0) {
+        // First file in a level.
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+                 0) {
+        // SSTs overlap but the end key of the previous file was not
+        // artificially extended by a range tombstone. Extend the current
+        // boundary.
+        cur_boundary.largest = &f->largest;
+      } else {
+        // Atomic compaction unit has ended.
+        add_unit_boundary(j);
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      }
+    }
+    add_unit_boundary(inputs[i].files.size());
+    assert(inputs[i].files.size() ==
+           inputs[i].atomic_compaction_unit_boundaries.size());
+  }
+  return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  int output_l0_idx;
+  if (output_level == 0) {
+    output_l0_idx = 0;
+    for (const auto* file : vstorage->LevelFiles(0)) {
+      if (inputs[0].files.back() == file) {
+        break;
+      }
+      ++output_l0_idx;
+    }
+    assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+  } else {
+    output_l0_idx = -1;
+  }
+  Slice smallest_key, largest_key;
+  GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+  return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+                                                  output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  size_t num_files_in_compaction = 0;
+  size_t total_num_files = 0;
+  for (int l = 0; l < vstorage->num_levels(); l++) {
+    total_num_files += vstorage->NumLevelFiles(l);
+  }
+  for (size_t i = 0; i < inputs.size(); i++) {
+    num_files_in_compaction += inputs[i].size();
+  }
+  return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(
+    VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+    const MutableCFOptions& _mutable_cf_options,
+    const MutableDBOptions& _mutable_db_options,
+    std::vector<CompactionInputFiles> _inputs, int _output_level,
+    uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+    uint32_t _output_path_id, CompressionType _compression,
+    CompressionOptions _compression_opts, Temperature _output_temperature,
+    uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+    bool _manual_compaction, const std::string& _trim_ts, double _score,
+    bool _deletion_compaction, bool l0_files_might_overlap,
+    CompactionReason _compaction_reason,
+    BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+    double _blob_garbage_collection_age_cutoff)
+    : input_vstorage_(vstorage),
+      start_level_(_inputs[0].level),
+      output_level_(_output_level),
+      target_output_file_size_(_target_file_size),
+      max_compaction_bytes_(_max_compaction_bytes),
+      max_subcompactions_(_max_subcompactions),
+      immutable_options_(_immutable_options),
+      mutable_cf_options_(_mutable_cf_options),
+      input_version_(nullptr),
+      number_levels_(vstorage->num_levels()),
+      cfd_(nullptr),
+      output_path_id_(_output_path_id),
+      output_compression_(_compression),
+      output_compression_opts_(_compression_opts),
+      output_temperature_(_output_temperature),
+      deletion_compaction_(_deletion_compaction),
+      l0_files_might_overlap_(l0_files_might_overlap),
+      inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+      grandparents_(std::move(_grandparents)),
+      score_(_score),
+      bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+      is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+      is_manual_compaction_(_manual_compaction),
+      trim_ts_(_trim_ts),
+      is_trivial_move_(false),
+
+      compaction_reason_(_compaction_reason),
+      notify_on_compaction_completion_(false),
+      enable_blob_garbage_collection_(
+          _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+              ? true
+              : (_blob_garbage_collection_policy ==
+                         BlobGarbageCollectionPolicy::kDisable
+                     ? false
+                     : mutable_cf_options()->enable_blob_garbage_collection)),
+      blob_garbage_collection_age_cutoff_(
+          _blob_garbage_collection_age_cutoff < 0 ||
+                  _blob_garbage_collection_age_cutoff > 1
+              ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+              : _blob_garbage_collection_age_cutoff),
+      penultimate_level_(EvaluatePenultimateLevel(
+          vstorage, immutable_options_, start_level_, output_level_)) {
+  MarkFilesBeingCompacted(true);
+  if (is_manual_compaction_) {
+    compaction_reason_ = CompactionReason::kManualCompaction;
+  }
+  if (max_subcompactions_ == 0) {
+    max_subcompactions_ = _mutable_db_options.max_subcompactions;
+  }
+
+  // For the non-bottommost levels, this tries to build files that match the
+  // target file size, but that is not guaranteed: a file can be up to 2x the
+  // target size.
+  max_output_file_size_ =
+      bottommost_level_ || grandparents_.empty() ||
+              !_immutable_options.level_compaction_dynamic_file_size
+          ? target_output_file_size_
+          : 2 * target_output_file_size_;
+
+#ifndef NDEBUG
+  for (size_t i = 1; i < inputs_.size(); ++i) {
+    assert(inputs_[i].level > inputs_[i - 1].level);
+  }
+#endif
+
+  // setup input_levels_
+  {
+    input_levels_.resize(num_input_levels());
+    for (size_t which = 0; which < num_input_levels(); which++) {
+      DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+                                &arena_);
+    }
+  }
+
+  GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+  // Every compaction regardless of any compaction reason may respect the
+  // existing compact cursor in the output level to split output files
+  output_split_key_ = nullptr;
+  if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+      immutable_options_.compaction_pri == kRoundRobin) {
+    const InternalKey* cursor =
+        &input_vstorage_->GetCompactCursors()[output_level_];
+    if (cursor->size() != 0) {
+      const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+      auto ucmp = vstorage->InternalComparator()->user_comparator();
+      // May split output files according to the cursor if it is in the
+      // user-key range
+      if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+              0 &&
+          ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+              0) {
+        output_split_key_ = cursor;
+      }
+    }
+  }
+
+  PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+  if (!SupportsPerKeyPlacement()) {
+    return;
+  }
+
+  // exclude the last level, the range of all input levels is the safe range
+  // of keys that can be moved up.
+  int exclude_level = number_levels_ - 1;
+  penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+  // For universal compaction, the penultimate_output_range could be extended
+  // if all penultimate level files are included in the compaction (which
+  // includes the case that the penultimate level is empty).
+  if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+    exclude_level = kInvalidLevel;
+    std::set<uint64_t> penultimate_inputs;
+    for (const auto& input_lvl : inputs_) {
+      if (input_lvl.level == penultimate_level_) {
+        for (const auto& file : input_lvl.files) {
+          penultimate_inputs.emplace(file->fd.GetNumber());
+        }
+      }
+    }
+    auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+    for (const auto& file : penultimate_files) {
+      if (penultimate_inputs.find(file->fd.GetNumber()) ==
+          penultimate_inputs.end()) {
+        exclude_level = number_levels_ - 1;
+        penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+        break;
+      }
+    }
+  }
+
+  GetBoundaryKeys(input_vstorage_, inputs_,
+                  &penultimate_level_smallest_user_key_,
+                  &penultimate_level_largest_user_key_, exclude_level);
+
+  // If there's a case that the penultimate level output range is overlapping
+  // with the existing files, disable the penultimate level output by setting
+  // the range to empty. One example is the range delete could have overlap
+  // boundary with the next file. (which is actually a false overlap)
+  // TODO: Exclude such false overlap, so it won't disable the penultimate
+  // output.
+  std::set<uint64_t> penultimate_inputs;
+  for (const auto& input_lvl : inputs_) {
+    if (input_lvl.level == penultimate_level_) {
+      for (const auto& file : input_lvl.files) {
+        penultimate_inputs.emplace(file->fd.GetNumber());
+      }
+    }
+  }
+
+  auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+  for (const auto& file : penultimate_files) {
+    if (penultimate_inputs.find(file->fd.GetNumber()) ==
+            penultimate_inputs.end() &&
+        OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
+                                           file->largest.user_key())) {
+      // Basically disable the penultimate range output; this should be rare,
+      // or a false overlap caused by a range del
+      penultimate_level_smallest_user_key_ = "";
+      penultimate_level_largest_user_key_ = "";
+      penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
+    }
+  }
+}
+
+Compaction::~Compaction() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    cfd_->UnrefAndTryDelete();
+  }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+  return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+// smallest_key and largest_key include timestamps if user-defined timestamp is
+// enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+    const Slice& smallest_key, const Slice& largest_key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  if (penultimate_level_smallest_user_key_.empty() ||
+      penultimate_level_largest_user_key_.empty()) {
+    return false;
+  }
+
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_smallest_user_key_) >= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+  int base_level = input_vstorage_->base_level();
+  bool matches =
+      (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+                          base_level) == output_compression_);
+  if (matches) {
+    TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+    return true;
+  }
+  TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+  return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If start_level_ == output_level_, the purpose is to force the compaction
+  // filter to be applied to that level, and thus cannot be a trivial move.
+
+  // Check if the start level has files with overlapping ranges
+  if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+      l0_files_might_overlap_) {
+    // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+    // overlapping, unless we are sure that files picked in L0 don't overlap.
+    return false;
+  }
+
+  if (is_manual_compaction_ &&
+      (immutable_options_.compaction_filter != nullptr ||
+       immutable_options_.compaction_filter_factory != nullptr)) {
+    // This is a manual compaction and we have a compaction filter that should
+    // be executed, so we cannot do a trivial move
+    return false;
+  }
+
+  if (start_level_ == output_level_) {
+    // It doesn't make sense if the compaction picker picks files just to
+    // trivially move them to the same level.
+    return false;
+  }
+
+  // Used in universal compaction, where trivial move can be done if the
+  // input files are non overlapping
+  if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+      (output_level_ != 0) &&
+      (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+    return is_trivial_move_;
+  }
+
+  if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+        input(0, 0)->fd.GetPathId() == output_path_id() &&
+        InputCompressionMatchesOutput())) {
+    return false;
+  }
+
+  // assert inputs_.size() == 1
+
+  std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+  for (const auto& file : inputs_.front().files) {
+    std::vector<FileMetaData*> file_grand_parents;
+    if (output_level_ + 1 >= number_levels_) {
+      continue;
+    }
+    input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+                                          &file->largest, &file_grand_parents);
+    const auto compaction_size =
+        file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+    if (compaction_size > max_compaction_bytes_) {
+      return false;
+    }
+
+    if (partitioner.get() != nullptr) {
+      if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+                                         file->largest.user_key())) {
+        return false;
+      }
+    }
+  }
+
+  // PerKeyPlacement compaction should never be trivial move.
+  if (SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+  for (size_t which = 0; which < num_input_levels(); which++) {
+    for (size_t i = 0; i < inputs_[which].size(); i++) {
+      out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+    }
+  }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+    const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+  assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+  if (bottommost_level_) {
+    return true;
+  } else if (output_level_ != 0 &&
+             cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    // Maybe use binary search to find right entry instead of linear search?
+    const Comparator* user_cmp = cfd_->user_comparator();
+    for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+      const std::vector<FileMetaData*>& files =
+          input_vstorage_->LevelFiles(lvl);
+      for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+        auto* f = files[level_ptrs->at(lvl)];
+        if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+          // We've advanced far enough
+          // In the presence of user-defined timestamp, we may need to handle
+          // the case in which f->smallest.user_key() (including ts) has the
+          // same user key, but the ts part is smaller. If so,
+          // Compare(user_key, f->smallest.user_key()) returns -1.
+          // That's why we need CompareWithoutTimestamp().
+          if (user_cmp->CompareWithoutTimestamp(user_key,
+                                                f->smallest.user_key()) >= 0) {
+            // Key falls in this file's range, so it may
+            // exist beyond output level
+            return false;
+          }
+          break;
+        }
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+  for (size_t i = 0; i < num_input_levels(); i++) {
+    for (size_t j = 0; j < inputs_[i].size(); j++) {
+      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+                               : inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = mark_as_compacted;
+    }
+  }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+    InputLevelSummaryBuffer* scratch) const {
+  int len = 0;
+  bool is_first = true;
+  for (auto& input_level : inputs_) {
+    if (input_level.empty()) {
+      continue;
+    }
+    if (!is_first) {
+      len +=
+          snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+      len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+    } else {
+      is_first = false;
+    }
+    len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                    "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+                    input_level.level);
+    len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+           " files to L%d", output_level());
+
+  return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+  uint64_t size = 0;
+  for (auto& input_level : inputs_) {
+    for (auto f : input_level.files) {
+      size += f->fd.GetFileSize();
+    }
+  }
+  return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+  MarkFilesBeingCompacted(false);
+  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+  assert(input_version_ != nullptr);
+  input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+                 int len) {
+  *output = '\0';
+  int write = 0;
+  for (size_t i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret;
+    char sztxt[16];
+    AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+    ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+                   files.at(i)->fd.GetNumber(), sztxt);
+    if (ret < 0 || ret >= sz) break;
+    write += ret;
+  }
+  // if files.size() is non-zero, overwrite the last space
+  return write - !!files.size();
+}
+}  // namespace
+
+void Compaction::Summary(char* output, int len) {
+  int write =
+      snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+               input_version_->GetVersionNumber(), start_level_);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+    if (level_iter > 0) {
+      write += snprintf(output + write, len - write, "], [");
+      if (write < 0 || write >= len) {
+        return;
+      }
+    }
+    write +=
+        InputSummary(inputs_[level_iter].files, output + write, len - write);
+    if (write < 0 || write >= len) {
+      return;
+    }
+  }
+
+  snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+  uint64_t preallocation_size = 0;
+
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      preallocation_size += file->fd.GetFileSize();
+    }
+  }
+
+  if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+      (immutable_options_.compaction_style == kCompactionStyleLevel ||
+       output_level() > 0)) {
+    preallocation_size = std::min(max_output_file_size_, preallocation_size);
+  }
+
+  // Over-estimate slightly so we don't end up just barely crossing
+  // the threshold
+  // No point in preallocating more than 1GB.
+  return std::min(uint64_t{1073741824},
+                  preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+  if (!cfd_->ioptions()->compaction_filter_factory) {
+    return nullptr;
+  }
+
+  if (!cfd_->ioptions()
+           ->compaction_filter_factory->ShouldFilterTableFileCreation(
+               TableFileCreationReason::kCompaction)) {
+    return nullptr;
+  }
+
+  CompactionFilter::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.column_family_id = cfd_->GetID();
+  context.reason = TableFileCreationReason::kCompaction;
+  return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+      context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+  if (!immutable_options_.sst_partitioner_factory) {
+    return nullptr;
+  }
+
+  SstPartitioner::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.output_level = output_level_;
+  context.smallest_user_key = smallest_user_key_;
+  context.largest_user_key = largest_user_key_;
+  return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+  return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+  if (cfd_ == nullptr) {
+    return false;
+  }
+
+  // Round-Robin pri under leveled compaction allows subcompactions by default
+  // and the number of subcompactions can be larger than max_subcompactions_
+  if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+      cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return output_level_ > 0;
+  }
+
+  if (max_subcompactions_ <= 1) {
+    return false;
+  }
+
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+  } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+    return number_levels_ > 1 && output_level_ > 0;
+  } else {
+    return false;
+  }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+  assert(input_version_);
+
+  const VersionStorageInfo* storage_info = input_version_->storage_info();
+  assert(storage_info);
+
+  if (storage_info->GetBlobFiles().empty()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    for (const FileMetaData* meta : inputs_[i].files) {
+      assert(meta);
+
+      if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+    const InternalKey* start, const InternalKey* end) const {
+  uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+  const InternalKeyComparator& icmp =
+      column_family_data()->internal_comparator();
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+        continue;
+      }
+      if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+        continue;
+      }
+      uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+      if (oldest_ancester_time != 0) {
+        min_oldest_ancester_time =
+            std::min(min_oldest_ancester_time, oldest_ancester_time);
+      }
+    }
+  }
+  return min_oldest_ancester_time;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+    const VersionStorageInfo* vstorage,
+    const ImmutableOptions& immutable_options, const int start_level,
+    const int output_level) {
+  // TODO: currently the per_key_placement feature only supports level and
+  // universal compaction
+  if (immutable_options.compaction_style != kCompactionStyleLevel &&
+      immutable_options.compaction_style != kCompactionStyleUniversal) {
+    return kInvalidLevel;
+  }
+  if (output_level != immutable_options.num_levels - 1) {
+    return kInvalidLevel;
+  }
+
+  int penultimate_level = output_level - 1;
+  assert(penultimate_level < immutable_options.num_levels);
+  if (penultimate_level <= 0) {
+    return kInvalidLevel;
+  }
+
+  // If the penultimate level is not within the input level -> output level
+  // range, check if the penultimate output level is empty; if it's empty, it
+  // could also be locked for the penultimate output.
+  // TODO: ideally, it only needs to check if there's a file within the
+  // compaction output key range. For simplicity, it just checks whether
+  // there's any file on the penultimate level.
+  if (start_level == immutable_options.num_levels - 1 &&
+      (immutable_options.compaction_style != kCompactionStyleUniversal ||
+       !vstorage->LevelFiles(penultimate_level).empty())) {
+    return kInvalidLevel;
+  }
+
+  bool supports_per_key_placement =
+      immutable_options.preclude_last_level_data_seconds > 0;
+
+  // it could be overridden by unittest
+  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+                           &supports_per_key_placement);
+  if (!supports_per_key_placement) {
+    return kInvalidLevel;
+  }
+
+  return penultimate_level;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of adjacent SSTs in this range "overlap" (i.e., the
+// largest user key of one file is the smallest user key of the next file).
+// These boundaries are propagated down to RangeDelAggregator during
+// compaction to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+  int level;
+  std::vector<FileMetaData*> files;
+  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+  inline bool empty() const { return files.empty(); }
+  inline size_t size() const { return files.size(); }
+  inline void clear() { files.clear(); }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+  Compaction(VersionStorageInfo* input_version,
+             const ImmutableOptions& immutable_options,
+             const MutableCFOptions& mutable_cf_options,
+             const MutableDBOptions& mutable_db_options,
+             std::vector<CompactionInputFiles> inputs, int output_level,
+             uint64_t target_file_size, uint64_t max_compaction_bytes,
+             uint32_t output_path_id, CompressionType compression,
+             CompressionOptions compression_opts,
+             Temperature output_temperature, uint32_t max_subcompactions,
+             std::vector<FileMetaData*> grandparents,
+             bool manual_compaction = false, const std::string& trim_ts = "",
+             double score = -1, bool deletion_compaction = false,
+             bool l0_files_might_overlap = true,
+             CompactionReason compaction_reason = CompactionReason::kUnknown,
+             BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+                 BlobGarbageCollectionPolicy::kUseDefault,
+             double blob_garbage_collection_age_cutoff = -1);
+
+  // The type of the penultimate level output range
+  enum class PenultimateOutputRangeType : int {
+    kNotSupported,  // it cannot output to the penultimate level
+    kFullRange,     // any data could be output to the penultimate level
+    kNonLastRange,  // only the keys within non_last_level compaction inputs
+                    // can be output to the penultimate level
+    kDisabled,      // no data can be output to the penultimate level
+  };
+
+  // No copying allowed
+  Compaction(const Compaction&) = delete;
+  void operator=(const Compaction&) = delete;
+
+  ~Compaction();
+
+  // Returns the level associated with the specified compaction input level.
+  // If compaction_input_level is not specified, then input_level is set to 0.
+  int level(size_t compaction_input_level = 0) const {
+    return inputs_[compaction_input_level].level;
+  }
+
+  int start_level() const { return start_level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return output_level_; }
+
+  // Returns the number of input levels in this compaction.
+  size_t num_input_levels() const { return inputs_.size(); }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // Returns the number of input files associated with the specified
+  // compaction input level.
+  // The function will return 0 when "compaction_input_level" < 0
+  // or "compaction_input_level" >= "num_input_levels()".
+  size_t num_input_files(size_t compaction_input_level) const {
+    if (compaction_input_level < inputs_.size()) {
+      return inputs_[compaction_input_level].size();
+    }
+    return 0;
+  }
+
+  // Returns input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  // Returns the ColumnFamilyData associated with the compaction.
+  ColumnFamilyData* column_family_data() const { return cfd_; }
+
+  // Returns the file meta data of the 'i'th input file at the
+  // specified compaction input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  FileMetaData* input(size_t compaction_input_level, size_t i) const {
+    assert(compaction_input_level < inputs_.size());
+    return inputs_[compaction_input_level][i];
+  }
+
+  const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+  }
+
+  // Returns the list of file meta data of the specified compaction
+  // input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  const std::vector<FileMetaData*>* inputs(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].files;
+  }
+
+  const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+  // Returns the LevelFilesBrief of the specified compaction input level.
+  const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+    return &input_levels_[compaction_input_level];
+  }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+  // Target output file size for this compaction
+  uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+  // What compression for output
+  CompressionType output_compression() const { return output_compression_; }
+
+  // What compression options for output
+  const CompressionOptions& output_compression_opts() const {
+    return output_compression_opts_;
+  }
+
+  // Whether the output file needs to be written to a secondary DB path.
+  uint32_t output_path_id() const { return output_path_id_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // The split user key in the output level if this compaction is required to
+  // split the output files according to the existing cursor in the output
+  // level under round-robin compaction policy. Empty indicates no required
+  // splitting key
+  const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+  // If true, then the compaction can be done by simply deleting input files.
+  bool deletion_compaction() const { return deletion_compaction_; }
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the available information we have guarantees that
+  // the input "user_key" does not exist in any level beyond "output_level()".
+  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+                                     std::vector<size_t>* level_ptrs) const;
+
+  // Clear all files to indicate that they are not being compacted
+  // Delete this compaction from the list of running compactions.
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Status status);
+
+  // Returns the summary of the compaction in "output" with maximum "len"
+  // in bytes. The caller is responsible for the memory management of
+  // "output".
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottommost level?
+  bool bottommost_level() const { return bottommost_level_; }
+
+  // Does the compaction compact to the last level?
+  bool is_last_level() const {
+    return output_level_ == immutable_options_.num_levels - 1;
+  }
+
+  // Does this compaction include all sst files?
+  bool is_full_compaction() const { return is_full_compaction_; }
+
+  // Was this compaction triggered manually by the client?
+  bool is_manual_compaction() const { return is_manual_compaction_; }
+
+  std::string trim_ts() const { return trim_ts_; }
+
+  // Used when the allow_trivial_move option is set in
+  // Universal compaction. If all the input files are
+  // non overlapping, then the is_trivial_move_ variable
+  // will be set true, else false
+  void set_is_trivial_move(bool trivial_move) {
+    is_trivial_move_ = trivial_move;
+  }
+
+  // Used when the allow_trivial_move option is set in
+  // Universal compaction. Returns true if the input files
+  // are non-overlapping and can be trivially moved.
+  bool is_trivial_move() const { return is_trivial_move_; }
+
+  // How many total levels are there?
+  int number_levels() const { return number_levels_; }
+
+  // Return the ImmutableOptions that should be used throughout the compaction
+  // procedure
+  const ImmutableOptions* immutable_options() const {
+    return &immutable_options_;
+  }
+
+  // Return the MutableCFOptions that should be used throughout the compaction
+  // procedure
+  const MutableCFOptions* mutable_cf_options() const {
+    return &mutable_cf_options_;
+  }
+
+  // Returns the size in bytes that the output file should be preallocated to.
+  // In level compaction, that is max_file_size_. In universal compaction, that
+  // is the sum of all input file sizes.
+  uint64_t OutputFilePreallocationSize() const;
+
+  void SetInputVersion(Version* input_version);
+
+  struct InputLevelSummaryBuffer {
+    char buffer[128];
+  };
+
+  const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+  uint64_t CalculateTotalInputSize() const;
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+
+  // Create a CompactionFilter from compaction_filter_factory
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+  // Create a SstPartitioner from sst_partitioner_factory
+  std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+  // Is the input level corresponding to output_level_ empty?
+  bool IsOutputLevelEmpty() const;
+
+  // Should this compaction be broken up into smaller ones run in parallel?
+  bool ShouldFormSubcompactions() const;
+
+  // Returns true iff at least one input file references a blob file.
+  //
+  // PRE: input version has been set.
+  bool DoesInputReferenceBlobFiles() const;
+
+  // test function to validate the functionality of IsBottommostLevel()
+  // function -- determines if compaction with inputs and storage is bottommost
+  static bool TEST_IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  TablePropertiesCollection GetOutputTableProperties() const {
+    return output_table_properties_;
+  }
+
+  void SetOutputTableProperties(TablePropertiesCollection tp) {
+    output_table_properties_ = std::move(tp);
+  }
+
+  Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+  Slice GetLargestUserKey() const { return largest_user_key_; }
+
+  Slice GetPenultimateLevelSmallestUserKey() const {
+    return penultimate_level_smallest_user_key_;
+  }
+
+  Slice GetPenultimateLevelLargestUserKey() const {
+    return penultimate_level_largest_user_key_;
+  }
+
+  PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+    return penultimate_output_range_type_;
+  }
+
+  // Return true if the compaction supports per_key_placement
+  bool SupportsPerKeyPlacement() const;
+
+  // Get per_key_placement penultimate output level, which is `last_level - 1`
+  // if per_key_placement feature is supported. Otherwise, return -1.
+  int GetPenultimateLevel() const;
+
+  // Return true if the given range overlaps with the penultimate level output
+  // range.
+  // Both smallest_key and largest_key include timestamps if user-defined
+  // timestamp is enabled.
+  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+                                          const Slice& largest_key) const;
+
+  // Return true if the key is within the penultimate level output range for
+  // the per_key_placement feature, which means it is safe to place the key on
+  // the penultimate level. Different compaction strategies have different
+  // rules. If per_key_placement is not supported, always return false.
+  // TODO: currently it doesn't support moving data from the last level to the
+  // penultimate level
+  // key includes timestamp if user-defined timestamp is enabled.
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+  CompactionReason compaction_reason() const { return compaction_reason_; }
+
+  const std::vector<FileMetaData*>& grandparents() const {
+    return grandparents_;
+  }
+
+  uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+  Temperature output_temperature() const { return output_temperature_; }
+
+  uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+  bool enable_blob_garbage_collection() const {
+    return enable_blob_garbage_collection_;
+  }
+
+  double blob_garbage_collection_age_cutoff() const {
+    return blob_garbage_collection_age_cutoff_;
+  }
+
+  // start and end are the sub-compaction range. Null if no boundary.
+  // This is used to filter out some input files' ancester time ranges.
+  uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+                                          const InternalKey* end) const;
+
+  // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+  // compaction begin and compaction completion callbacks match.
+
+  // Called by DBImpl::NotifyOnCompactionCompleted to make sure the numbers
+  // of compaction-begin and compaction-completion callbacks match.
+  void SetNotifyOnCompactionCompleted() {
+    notify_on_compaction_completion_ = true;
+  }
+
+  bool ShouldNotifyOnCompactionCompleted() const {
+    return notify_on_compaction_completion_;
+  }
+
+  static constexpr int kInvalidLevel = -1;
+
+  // Evaluate the penultimate output level. If the compaction supports the
+  // per_key_placement feature, it returns the penultimate level number.
+  // Otherwise, it returns kInvalidLevel (-1), which means
+  // output_to_penultimate_level is not supported.
+  // Note: even if penultimate level output is supported (PenultimateLevel !=
+  // kInvalidLevel), some key ranges may be unsafe to output to the
+  // penultimate level. The safe key range is populated by
+  // `PopulatePenultimateLevelOutputRange()`, which could potentially disable
+  // all penultimate level output.
+  static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+                                      const ImmutableOptions& immutable_options,
+                                      const int start_level,
+                                      const int output_level);
+
+ private:
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+  // get the smallest and largest key present in files to be compacted
+  static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+                              const std::vector<CompactionInputFiles>& inputs,
+                              Slice* smallest_key, Slice* largest_key,
+                              int exclude_level = -1);
+
+  // Populate the penultimate level output range, which is used to determine
+  // whether a key is safe to output to the penultimate level (for details,
+  // see `Compaction::WithinPenultimateLevelOutputRange()`).
+  void PopulatePenultimateLevelOutputRange();
+
+  // Get the atomic file boundaries for all files in the compaction. Necessary
+  // in order to avoid the scenario described in
+  // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+  // plumb down appropriate key boundaries to RangeDelAggregator during
+  // compaction.
+  static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+      VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+  // helper function to determine if compaction with inputs and storage is
+  // bottommost
+  static bool IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  static bool IsFullCompaction(VersionStorageInfo* vstorage,
+                               const std::vector<CompactionInputFiles>& inputs);
+
+  VersionStorageInfo* input_vstorage_;
+
+  const int start_level_;   // the lowest level to be compacted
+  const int output_level_;  // level to which output files are stored
+  uint64_t target_output_file_size_;
+  uint64_t max_output_file_size_;
+  uint64_t max_compaction_bytes_;
+  uint32_t max_subcompactions_;
+  const ImmutableOptions immutable_options_;
+  const MutableCFOptions mutable_cf_options_;
+  Version* input_version_;
+  VersionEdit edit_;
+  const int number_levels_;
+  ColumnFamilyData* cfd_;
+  Arena arena_;  // Arena used to allocate space for file_levels_
+
+  const uint32_t output_path_id_;
+  CompressionType output_compression_;
+  CompressionOptions output_compression_opts_;
+  Temperature output_temperature_;
+  // If true, then the compaction can be done by simply deleting input files.
+  const bool deletion_compaction_;
+  // should it split the output file using the compact cursor?
+  const InternalKey* output_split_key_;
+
+  // L0 files in the LSM-tree might be overlapping. But the compaction-picking
+  // logic might pick a subset of the files that aren't overlapping. If
+  // that is the case, set the value to false. Otherwise, set it to true.
+  bool l0_files_might_overlap_;
+
+  // Compaction input files organized by level. Constant after construction
+  const std::vector<CompactionInputFiles> inputs_;
+
+  // A copy of inputs_, organized more closely in memory
+  autovector<LevelFilesBrief, 2> input_levels_;
+
+  // State used to check for number of overlapping grandparent files
+  // (grandparent == "output_level_ + 1")
+  std::vector<FileMetaData*> grandparents_;
+  const double score_;  // score that was used to pick this compaction.
+
+  // Is this compaction creating a file in the bottommost level?
+  const bool bottommost_level_;
+  // Does this compaction include all sst files?
+  const bool is_full_compaction_;
+
+  // Was this compaction requested by the client?
+  const bool is_manual_compaction_;
+
+  // Data with timestamp > trim_ts_ will be removed
+  const std::string trim_ts_;
+
+  // True if we can do a trivial move in Universal multi-level
+  // compaction
+  bool is_trivial_move_;
+
+  // Does input compression match the output compression?
+  bool InputCompressionMatchesOutput() const;
+
+  // table properties of output files
+  TablePropertiesCollection output_table_properties_;
+
+  // smallest user key in the compaction;
+  // includes timestamp if user-defined timestamp is enabled.
+  Slice smallest_user_key_;
+
+  // largest user key in the compaction;
+  // includes timestamp if user-defined timestamp is enabled.
+  Slice largest_user_key_;
+
+  // Reason for compaction
+  CompactionReason compaction_reason_;
+
+  // Notify on compaction completion only if the listener was notified on
+  // compaction begin.
+  bool notify_on_compaction_completion_;
+
+  // Enable/disable garbage collection of blobs during compaction.
+  bool enable_blob_garbage_collection_;
+
+  // Blob garbage collection age cutoff.
+  double blob_garbage_collection_age_cutoff_;
+
+  // Only set when the per_key_placement feature is enabled; -1
+  // (kInvalidLevel) means not supported.
+  const int penultimate_level_;
+
+  // Key range for penultimate level output;
+  // includes timestamp if user-defined timestamp is enabled.
+  // penultimate_output_range_type_ shows the range type.
+  Slice penultimate_level_smallest_user_key_;
+  Slice penultimate_level_largest_user_key_;
+  PenultimateOutputRangeType penultimate_output_range_type_ =
+      PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+  const int level;
+  const Slice key;
+  const Slice value;
+  const SequenceNumber seq_num;
+
+  bool output_to_penultimate_level;
+
+  PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+                         SequenceNumber _seq_num)
+      : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+    output_to_penultimate_level = false;
+  }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+}  // namespace ROCKSDB_NAMESPACE
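// Editorial note (not part of the diff): PerKeyPlacementContext above is a
// test-only hook. Tests reach it through the sync point fired in
// CompactionIterator::DecideOutputLevel() (see compaction_iterator.cc below),
// so a test can force a placement decision roughly like this:
//
//   SyncPoint::GetInstance()->SetCallBack(
//       "CompactionIterator::PrepareOutput.context", [](void* arg) {
//         auto* ctx = static_cast<PerKeyPlacementContext*>(arg);
//         ctx->output_to_penultimate_level = true;  // keep this key "hot"
//       });
//   SyncPoint::GetInstance()->EnableProcessing();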
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..1b1c28b57
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+  // Compaction statistics
+
+  // Doesn't include records skipped because of
+  // CompactionFilter::Decision::kRemoveAndSkipUntil.
+  int64_t num_record_drop_user = 0;
+
+  int64_t num_record_drop_hidden = 0;
+  int64_t num_record_drop_obsolete = 0;
+  int64_t num_record_drop_range_del = 0;
+  int64_t num_range_del_drop_obsolete = 0;
+  // Deletions obsoleted before the bottom level due to file gap optimization.
+  int64_t num_optimized_del_drop_obsolete = 0;
+  uint64_t total_filter_time = 0;
+
+  // Input statistics
+  // TODO(noetzli): The stats are incomplete. They are lacking everything
+  // consumed by MergeHelper.
+  uint64_t num_input_records = 0;
+  uint64_t num_input_deletion_records = 0;
+  uint64_t num_input_corrupt_records = 0;
+  uint64_t total_input_raw_key_bytes = 0;
+  uint64_t total_input_raw_value_bytes = 0;
+
+  // Single-Delete diagnostics for exceptional situations
+  uint64_t num_single_del_fallthru = 0;
+  uint64_t num_single_del_mismatch = 0;
+
+  // Blob related statistics
+  uint64_t num_blobs_read = 0;
+  uint64_t total_blob_bytes_read = 0;
+  uint64_t num_blobs_relocated = 0;
+  uint64_t total_blob_bytes_relocated = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
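// Editorial note (not part of the diff): CompactionIterationStats is a plain
// aggregate with no methods, so a caller that runs several iterators can
// merge the counters field by field. A minimal, hypothetical accumulator:
//
//   void AddIterationStats(const CompactionIterationStats& from,
//                          CompactionIterationStats* to) {
//     to->num_input_records += from.num_input_records;
//     to->num_record_drop_user += from.num_record_drop_user;
//     to->num_blobs_read += from.num_blobs_read;
//     to->total_blob_bytes_read += from.total_blob_bytes_read;
//     // ...and so on for the remaining counters.
//   }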
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..9f54f7813
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const Compaction* compaction, const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : CompactionIterator(
+          input, cmp, merge_helper, last_sequence, snapshots,
+          earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+          report_detailed_time, expect_valid_internal_key, range_del_agg,
+          blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+          manual_compaction_canceled,
+          std::unique_ptr<CompactionProxy>(
+              compaction ? new RealCompaction(compaction) : nullptr),
+          compaction_filter, shutting_down, info_log, full_history_ts_low,
+          preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    std::unique_ptr<CompactionProxy> compaction,
+    const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : input_(input, cmp,
+             !compaction || compaction->DoesInputReferenceBlobFiles()),
+      cmp_(cmp),
+      merge_helper_(merge_helper),
+      snapshots_(snapshots),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      job_snapshot_(job_snapshot),
+      snapshot_checker_(snapshot_checker),
+      env_(env),
+      clock_(env_->GetSystemClock().get()),
+      report_detailed_time_(report_detailed_time),
+      expect_valid_internal_key_(expect_valid_internal_key),
+      range_del_agg_(range_del_agg),
+      blob_file_builder_(blob_file_builder),
+      compaction_(std::move(compaction)),
+      compaction_filter_(compaction_filter),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      bottommost_level_(!compaction_ ? false
+                                     : compaction_->bottommost_level() &&
+                                           !compaction_->allow_ingest_behind()),
+      // snapshots_ cannot be nullptr, but we will assert later in the body of
+      // the constructor.
+      visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+      earliest_snapshot_(!snapshots_ || snapshots_->empty()
+                             ? kMaxSequenceNumber
+                             : snapshots_->at(0)),
+      info_log_(info_log),
+      allow_data_in_errors_(allow_data_in_errors),
+      enforce_single_del_contracts_(enforce_single_del_contracts),
+      timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+      full_history_ts_low_(full_history_ts_low),
+      current_user_key_sequence_(0),
+      current_user_key_snapshot_(0),
+      merge_out_iter_(merge_helper_),
+      blob_garbage_collection_cutoff_file_number_(
+          ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+      blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+      prefetch_buffers_(
+          CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+      current_key_committed_(false),
+      cmp_with_history_ts_low_(0),
+      level_(compaction_ == nullptr ? 0 : compaction_->level()),
+      preserve_time_min_seqno_(preserve_time_min_seqno),
+      preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+  assert(snapshots_ != nullptr);
+  assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+  if (compaction_ != nullptr) {
+    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+  }
+#ifndef NDEBUG
+  // findEarliestVisibleSnapshot assumes this ordering.
+  for (size_t i = 1; i < snapshots_->size(); ++i) {
+    assert(snapshots_->at(i - 1) < snapshots_->at(i));
+  }
+  assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+         timestamp_size_ == full_history_ts_low_->size());
+#endif
+  input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+  // input_'s iterator lifetime is longer than pinned_iters_mgr_'s lifetime
+  input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+  iter_stats_.num_record_drop_user = 0;
+  iter_stats_.num_record_drop_hidden = 0;
+  iter_stats_.num_record_drop_obsolete = 0;
+  iter_stats_.num_record_drop_range_del = 0;
+  iter_stats_.num_range_del_drop_obsolete = 0;
+  iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+  NextFromInput();
+  PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+  // If there is a merge output, return it before continuing to process the
+  // input.
+  if (merge_out_iter_.Valid()) {
+    merge_out_iter_.Next();
+
+    // Check if we returned all records of the merge output.
+    if (merge_out_iter_.Valid()) {
+      key_ = merge_out_iter_.key();
+      value_ = merge_out_iter_.value();
+      Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+      // MergeUntil stops when it encounters a corrupt key and does not
+      // include it in the result, so we expect the keys here to be valid.
+      if (!s.ok()) {
+        ROCKS_LOG_FATAL(
+            info_log_, "Invalid ikey %s in compaction. %s",
+            allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+            s.getState());
+        assert(false);
+      }
+
+      // Keep current_key_ in sync.
+      if (0 == timestamp_size_) {
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      } else {
+        Slice ts = ikey_.GetTimestamp(timestamp_size_);
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts);
+      }
+      key_ = current_key_.GetInternalKey();
+      ikey_.user_key = current_key_.GetUserKey();
+      validity_info_.SetValid(ValidContext::kMerge1);
+    } else {
+      // We consumed all pinned merge operands, release pinned iterators
+      pinned_iters_mgr_.ReleasePinnedData();
+      // MergeHelper moves the iterator to the first record after the merged
+      // records, so even though we reached the end of the merge output, we do
+      // not want to advance the iterator.
+      NextFromInput();
+    }
+  } else {
+    // Only advance the input iterator if there is no merge output and the
+    // iterator is not already at the next record.
+    if (!at_next_) {
+      AdvanceInputIter();
+    }
+    NextFromInput();
+  }
+
+  if (Valid()) {
+    // Record that we've outputted a record for the current key.
+    has_outputted_key_ = true;
+  }
+
+  PrepareOutput();
+}
+
+bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+                                              Slice* skip_until) {
+  // TODO: support compaction filter for wide-column entities
+  if (!compaction_filter_ ||
+      (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+    return true;
+  }
+  bool error = false;
+  // If the user has specified a compaction filter and the sequence
+  // number is greater than any external snapshot, then invoke the
+  // filter. If the return value of the compaction filter is true,
+  // replace the entry with a deletion marker.
+  CompactionFilter::Decision filter =
+      CompactionFilter::Decision::kUndetermined;
+  compaction_filter_value_.clear();
+  compaction_filter_skip_until_.Clear();
+  CompactionFilter::ValueType value_type =
+      ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+                               : CompactionFilter::ValueType::kBlobIndex;
+  // Hack: pass the internal key to BlobIndexCompactionFilter since it needs
+  // to get the sequence number.
+  assert(compaction_filter_);
+  Slice& filter_key =
+      (ikey_.type == kTypeValue ||
+       !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
+          ? ikey_.user_key
+          : key_;
+  {
+    StopWatchNano timer(clock_, report_detailed_time_);
+    if (kTypeBlobIndex == ikey_.type) {
+      filter = compaction_filter_->FilterBlobByKey(
+          level_, filter_key, &compaction_filter_value_,
+          compaction_filter_skip_until_.rep());
+      if (CompactionFilter::Decision::kUndetermined == filter &&
+          !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+        if (compaction_ == nullptr) {
+          status_ =
+              Status::Corruption("Unexpected blob index outside of compaction");
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        TEST_SYNC_POINT_CALLBACK(
+            "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+            &value_);
+
+        // For the integrated BlobDB impl, CompactionIterator reads the blob
+        // value. For the stacked BlobDB impl, the corresponding
+        // CompactionFilter's FilterV2 method should read the blob value.
+        BlobIndex blob_index;
+        Status s = blob_index.DecodeFrom(value_);
+        if (!s.ok()) {
+          status_ = s;
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        FilePrefetchBuffer* prefetch_buffer =
+            prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+                                    blob_index.file_number())
+                              : nullptr;
+
+        uint64_t bytes_read = 0;
+
+        assert(blob_fetcher_);
+
+        s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index,
+                                     prefetch_buffer, &blob_value_,
+                                     &bytes_read);
+        if (!s.ok()) {
+          status_ = s;
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        ++iter_stats_.num_blobs_read;
+        iter_stats_.total_blob_bytes_read += bytes_read;
+
+        value_type = CompactionFilter::ValueType::kValue;
+      }
+    }
+    if (CompactionFilter::Decision::kUndetermined == filter) {
+      filter = compaction_filter_->FilterV2(
+          level_, filter_key, value_type,
+          blob_value_.empty() ? value_ : blob_value_,
+          &compaction_filter_value_, compaction_filter_skip_until_.rep());
+    }
+    iter_stats_.total_filter_time +=
+        env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+  }
+
+  if (CompactionFilter::Decision::kUndetermined == filter) {
+    // Should not reach here, since FilterV2 should never return kUndetermined.
+    status_ =
+        Status::NotSupported("FilterV2() should never return kUndetermined");
+    validity_info_.Invalidate();
+    return false;
+  }
+
+  if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+      cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+          0) {
+    // Can't skip to a key smaller than the current one.
+    // Keep the key as per FilterV2 documentation.
+    filter = CompactionFilter::Decision::kKeep;
+  }
+
+  if (filter == CompactionFilter::Decision::kRemove) {
+    // convert the current key to a delete; key_ is pointing into
+    // current_key_ at this point, so updating current_key_ updates key()
+    ikey_.type = kTypeDeletion;
+    current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+    // no value associated with delete
+    value_.clear();
+    iter_stats_.num_record_drop_user++;
+  } else if (filter == CompactionFilter::Decision::kPurge) {
+    // convert the current key to a single delete; key_ is pointing into
+    // current_key_ at this point, so updating current_key_ updates key()
+    ikey_.type = kTypeSingleDeletion;
+    current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+    // no value associated with single delete
+    value_.clear();
+    iter_stats_.num_record_drop_user++;
+  } else if (filter == CompactionFilter::Decision::kChangeValue) {
+    if (ikey_.type == kTypeBlobIndex) {
+      // value transfer from blob file to inlined data
+      ikey_.type = kTypeValue;
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+    }
+    value_ = compaction_filter_value_;
+  } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+    *need_skip = true;
+    compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+                                                     kValueTypeForSeek);
+    *skip_until = compaction_filter_skip_until_.Encode();
+  } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+    // Only the StackableDB-based BlobDB impl's compaction filter should return
+    // kChangeBlobIndex. The decision about rewriting a blob and changing the
+    // blob index in the integrated BlobDB impl is made in the subsequent call
+    // to PrepareOutput() and its callees.
+    if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+      status_ = Status::NotSupported(
+          "Only stacked BlobDB's internal compaction filter can return "
+          "kChangeBlobIndex.");
+      validity_info_.Invalidate();
+      return false;
+    }
+    if (ikey_.type == kTypeValue) {
+      // value transfer from inlined data to blob file
+      ikey_.type = kTypeBlobIndex;
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+    }
+    value_ = compaction_filter_value_;
+  } else if (filter == CompactionFilter::Decision::kIOError) {
+    if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+      status_ = Status::NotSupported(
+          "CompactionFilter for integrated BlobDB should not return kIOError");
+      validity_info_.Invalidate();
+      return false;
+    }
+    status_ = Status::IOError("Failed to access blob during compaction filter");
+    error = true;
+  }
+  return !error;
+}
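// Editorial note (not part of the diff): the Decision values handled above
// come from user-supplied filters. A minimal, hypothetical CompactionFilter
// that drops values carrying an "expired:" prefix and keeps everything else:
//
//   class DropExpiredFilter : public CompactionFilter {
//    public:
//     Decision FilterV2(int /*level*/, const Slice& /*key*/,
//                       ValueType value_type, const Slice& existing_value,
//                       std::string* /*new_value*/,
//                       std::string* /*skip_until*/) const override {
//       if (value_type == ValueType::kValue &&
//           existing_value.starts_with("expired:")) {
//         return Decision::kRemove;  // becomes a deletion marker above
//       }
//       return Decision::kKeep;
//     }
//     const char* Name() const override { return "DropExpiredFilter"; }
//   };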
+
+void CompactionIterator::NextFromInput() {
+  at_next_ = false;
+  validity_info_.Invalidate();
+
+  while (!Valid() && input_.Valid() && !IsPausingManualCompaction() &&
+         !IsShuttingDown()) {
+    key_ = input_.key();
+    value_ = input_.value();
+    blob_value_.Reset();
+    iter_stats_.num_input_records++;
+
+    Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+    if (!pik_status.ok()) {
+      iter_stats_.num_input_corrupt_records++;
+
+      // If `expect_valid_internal_key_` is false, return the corrupted key
+      // and let the caller decide what to do with it.
+      if (expect_valid_internal_key_) {
+        status_ = pik_status;
+        return;
+      }
+      key_ = current_key_.SetInternalKey(key_);
+      has_current_user_key_ = false;
+      current_user_key_sequence_ = kMaxSequenceNumber;
+      current_user_key_snapshot_ = 0;
+      validity_info_.SetValid(ValidContext::kParseKeyError);
+      break;
+    }
+    TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+    // Update input statistics
+    if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
+        ikey_.type == kTypeDeletionWithTimestamp) {
+      iter_stats_.num_input_deletion_records++;
+    }
+    iter_stats_.total_input_raw_key_bytes += key_.size();
+    iter_stats_.total_input_raw_value_bytes += value_.size();
+
+    // If need_skip is true, we should seek the input iterator
+    // to the internal key skip_until and continue from there.
+    bool need_skip = false;
+    // Points either into compaction_filter_skip_until_ or into
+    // merge_helper_->compaction_filter_skip_until_.
+    Slice skip_until;
+
+    bool user_key_equal_without_ts = false;
+    int cmp_ts = 0;
+    if (has_current_user_key_) {
+      user_key_equal_without_ts =
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_);
+      // if timestamp_size_ > 0, then curr_ts_ has been initialized by a
+      // previous key.
+      cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp(
+                                     ExtractTimestampFromUserKey(
+                                         ikey_.user_key, timestamp_size_),
+                                     curr_ts_)
+                               : 0;
+    }
+
+    // Check whether the user key changed. After this if statement current_key_
+    // is a copy of the current input key (maybe converted to a delete by the
+    // compaction filter). ikey_.user_key is pointing to the copy.
+    if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) {
+      // First occurrence of this user key
+      // Copy key for output
+      key_ = current_key_.SetInternalKey(key_, &ikey_);
+
+      int prev_cmp_with_ts_low =
+          !full_history_ts_low_ ? 0
+          : curr_ts_.empty()
+              ? 0
+              : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_);
+
+      // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for use
+      // in the next iteration to compare with the timestamp of the next key.
+      UpdateTimestampAndCompareWithFullHistoryLow();
+
+      // If
+      // (1) !has_current_user_key_, OR
+      // (2) timestamp is disabled, OR
+      // (3) all history will be preserved, OR
+      // (4) user key (excluding timestamp) is different from previous key, OR
+      // (5) timestamp is NO older than *full_history_ts_low_, OR
+      // (6) timestamp is the largest one older than full_history_ts_low_,
+      // then current_user_key_ must be treated as a different user key.
+      // This means, if a user key (excluding ts) is the same as the previous
+      // user key, and its ts is older than *full_history_ts_low_, then we
+      // consider this key for GC, e.g. it may be dropped if certain conditions
+      // match.
+      if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ ||
+          !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 ||
+          prev_cmp_with_ts_low >= 0) {
+        // Initialize for future comparison for rule (A) etc.
+        current_user_key_sequence_ = kMaxSequenceNumber;
+        current_user_key_snapshot_ = 0;
+        has_current_user_key_ = true;
+      }
+      current_user_key_ = ikey_.user_key;
+
+      has_outputted_key_ = false;
+
+      last_key_seq_zeroed_ = false;
+
+      current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+      // Apply the compaction filter to the first committed version of the user
+      // key.
+      if (current_key_committed_ &&
+          !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+        break;
+      }
+    } else {
+      // Update the current key to reflect the new sequence number/type without
+      // copying the user key.
+      // TODO(rven): Compaction filter does not process keys in this path
+      // Need to have the compaction filter process multiple versions
+      // if we have versions on both sides of a snapshot
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      key_ = current_key_.GetInternalKey();
+      ikey_.user_key = current_key_.GetUserKey();
+
+      // Note that a newer version of a key is ordered before older versions.
+      // If a newer version of a key is committed, so is the older version. No
+      // need to query snapshot_checker_ in that case.
+      if (UNLIKELY(!current_key_committed_)) {
+        assert(snapshot_checker_ != nullptr);
+        current_key_committed_ = KeyCommitted(ikey_.sequence);
+        // Apply the compaction filter to the first committed version of the
+        // user key.
+        if (current_key_committed_ &&
+            !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+          break;
+        }
+      }
+    }
+
+    if (UNLIKELY(!current_key_committed_)) {
+      assert(snapshot_checker_ != nullptr);
+      validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted);
+      break;
+    }
+
+    // If there are no snapshots, then this kv affects visibility at tip.
+    // Otherwise, search through all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+    SequenceNumber last_sequence = current_user_key_sequence_;
+    current_user_key_sequence_ = ikey_.sequence;
+    SequenceNumber last_snapshot = current_user_key_snapshot_;
+    SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
+    current_user_key_snapshot_ =
+        visible_at_tip_
+            ? earliest_snapshot_
+            : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+    if (need_skip) {
+      // This case is handled below.
+    } else if (clear_and_output_next_key_) {
+      // In the previous iteration we encountered a single delete that we could
+      // not compact out. We will keep this Put, but can drop its data.
+      // (See Optimization 3, below.)
+      if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+          ikey_.type != kTypeWideColumnEntity) {
+        ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str());
+        assert(false);
+      }
+      if (current_user_key_snapshot_ < last_snapshot) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "key %s, current_user_key_snapshot_ (%" PRIu64
+                        ") < last_snapshot (%" PRIu64 ")",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+                        current_user_key_snapshot_, last_snapshot);
+        assert(false);
+      }
+
+      if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) {
+        ikey_.type = kTypeValue;
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      }
+
+      value_.clear();
+      validity_info_.SetValid(ValidContext::kKeepSDAndClearPut);
+      clear_and_output_next_key_ = false;
+    } else if (ikey_.type == kTypeSingleDeletion) {
+      // We can compact out a SingleDelete if:
+      // 1) We encounter the corresponding PUT -OR- we know that this key
+      //    doesn't appear past this output level
+      // =AND=
+      // 2) We've already returned a record in this snapshot -OR- there is
+      //    no earlier earliest_write_conflict_snapshot.
+      //
+      // A note about 2) above:
+      // we try to determine whether there is any earlier write conflict
+      // checking snapshot by calling DefinitelyInSnapshot() with seq and
+      // earliest_write_conflict_snapshot as arguments.
+      // For write-prepared and write-unprepared transactions, if
+      // earliest_write_conflict_snapshot is evicted from
+      // WritePreparedTxnDB::commit_cache, then
+      // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
+      // false, even if the seq is actually visible within
+      // earliest_write_conflict_snapshot. Consequently, CompactionIterator
+      // may try to zero out its sequence number, thus hitting an assertion
+      // error in debug mode or causing incorrect DBIter results.
+      // We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
+      // and the seq zeroing logic depends on
+      // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
+      // determine whether seq is **definitely** in
+      // earliest_write_conflict_snapshot, then we can additionally check if
+      // seq is definitely in earliest_snapshot. If the latter holds, then the
+      // former holds too.
+      //
+      // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+      // allow Transactions to do write-conflict checking (if we compacted away
+      // all keys, then we wouldn't know that a write happened in this
+      // snapshot). If there is no earlier snapshot, then we know that there
+      // are no active transactions that need to know about any writes.
+      //
+      // Optimization 3:
+      // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+      // true, then we must output a SingleDelete. In this case, we will decide
+      // to also output the PUT. While we are compacting less by outputting the
+      // PUT now, hopefully this will lead to better compaction in the future
+      // when Rule 2 is later true (i.e., we are hoping we can later compact
+      // out both the SingleDelete and the Put, while we couldn't if we only
+      // outputted the SingleDelete now).
+      // In this case, we can save space by removing the PUT's value as it will
+      // never be read.
+      //
+      // Deletes and Merges are not supported on the same key that has a
+      // SingleDelete as it is not possible to correctly do any partial
+      // compaction of such a combination of operations. The result of mixing
+      // those operations for a given key is documented as being undefined. So
+      // we can choose how to handle such combinations of operations. We will
+      // try to compact out as much as we can in these cases.
+      // We will report counts on these anomalous cases.
+      //
+      // Note: If timestamp is enabled, then a record will be eligible for
+      // deletion only if, along with the above conditions (Rule 1 and Rule 2),
+      // full_history_ts_low_ is specified and the timestamp for that key is
+      // less than *full_history_ts_low_. If it's not eligible for deletion,
+      // then we will output the SingleDelete. Likewise for Optimization 3,
+      // the optimization is applied only if full_history_ts_low_ is specified
+      // and the timestamp for the key is less than *full_history_ts_low_.
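// Editorial note (not part of the diff): a worked example of Rules 1 and 2,
// with hypothetical keys and sequence numbers. Input, newest first:
// SD(k, seq=50), PUT(k, seq=40), with no other version of k in between.
//
//   - If seq 50 is definitely in earliest_write_conflict_snapshot (or a key
//     was already output in this snapshot), both records are dropped
//     (Rules 1 and 2 hold).
//   - If an earlier write-conflict snapshot exists, the SD is output and the
//     PUT is output with its value cleared (Optimization 3).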
+
+      // The easiest way to process a SingleDelete during iteration is to peek
+      // ahead at the next key.
+      const bool is_timestamp_eligible_for_gc =
+          (timestamp_size_ == 0 ||
+           (full_history_ts_low_ && cmp_with_history_ts_low_ < 0));
+
+      ParsedInternalKey next_ikey;
+      AdvanceInputIter();
+
+      // Check whether the next key exists, is not corrupt, and is the same key
+      // as the single delete.
+      if (input_.Valid() &&
+          ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+              .ok() &&
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+#ifndef NDEBUG
+        const Compaction* c =
+            compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+        TEST_SYNC_POINT_CALLBACK(
+            "CompactionIterator::NextFromInput:SingleDelete:1",
+            const_cast<Compaction*>(c));
+        if (last_key_seq_zeroed_) {
+          ++iter_stats_.num_record_drop_hidden;
+          ++iter_stats_.num_record_drop_obsolete;
+          assert(bottommost_level_);
+          AdvanceInputIter();
+        } else if (prev_snapshot == 0 ||
+                   DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) {
+          // Check whether the next key belongs to the same snapshot as the
+          // SingleDelete.
+
+          TEST_SYNC_POINT_CALLBACK(
+              "CompactionIterator::NextFromInput:SingleDelete:2", nullptr);
+          if (next_ikey.type == kTypeSingleDeletion) {
+            // We encountered two SingleDeletes for the same key in a row. This
+            // could be due to unexpected user input. If a write-(un)prepared
+            // transaction is used, this could also be due to releasing an old
+            // snapshot between a Put and its matching SingleDelete.
+            // Skip the first SingleDelete and let the next iteration decide
+            // how to handle the second SingleDelete.
+
+            // First SingleDelete has been skipped since we already called
+            // input_.Next().
+            ++iter_stats_.num_record_drop_obsolete;
+            ++iter_stats_.num_single_del_mismatch;
+          } else if (next_ikey.type == kTypeDeletion) {
+            std::ostringstream oss;
+            oss << "Found SD and type: " << static_cast<int>(next_ikey.type)
+                << " on the same key, violating the contract "
+                   "of SingleDelete. Check your application to make sure the "
+                   "application does not mix SingleDelete and Delete for "
+                   "the same key. If you are using "
+                   "write-prepared/write-unprepared transactions, and use "
+                   "SingleDelete to delete certain keys, then make sure "
+                   "TransactionDBOptions::rollback_deletion_type_callback is "
+                   "configured properly. Mixing SD and DEL can lead to "
+                   "undefined behaviors";
+            ++iter_stats_.num_record_drop_obsolete;
+            ++iter_stats_.num_single_del_mismatch;
+            if (enforce_single_del_contracts_) {
+              ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str());
+              validity_info_.Invalidate();
+              status_ = Status::Corruption(oss.str());
+              return;
+            }
+            ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str());
+          } else if (!is_timestamp_eligible_for_gc) {
+            // We cannot drop the SingleDelete as timestamp is enabled, and
+            // the timestamp of this key is greater than or equal to
+            // *full_history_ts_low_. We will output the SingleDelete.
+            validity_info_.SetValid(ValidContext::kKeepTsHistory);
+          } else if (has_outputted_key_ ||
+                     DefinitelyInSnapshot(ikey_.sequence,
+                                          earliest_write_conflict_snapshot_) ||
+                     (earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
+                      DefinitelyInSnapshot(ikey_.sequence,
+                                           earliest_snapshot_))) {
+            // Found a matching value, we can drop the single delete and the
+            // value. It is safe to drop both records since we've already
+            // outputted a key in this snapshot, or there is no earlier
+            // snapshot (Rule 2 above).
+
+            // Note: it doesn't matter whether the second key is a Put or if it
+            // is an unexpected Merge or Delete. We will compact it out
+            // either way. We will maintain counts of how many mismatches
+            // happened.
+            if (next_ikey.type != kTypeValue &&
+                next_ikey.type != kTypeBlobIndex &&
+                next_ikey.type != kTypeWideColumnEntity) {
+              ++iter_stats_.num_single_del_mismatch;
+            }
+
+            ++iter_stats_.num_record_drop_hidden;
+            ++iter_stats_.num_record_drop_obsolete;
+            // Already called input_.Next() once. Call it a second time to
+            // skip past the second key.
+            AdvanceInputIter();
+          } else {
+            // Found a matching value, but we cannot drop both keys since
+            // there is an earlier snapshot and we need to leave behind a record
+            // to know that a write happened in this snapshot (Rule 2 above).
+            // Clear the value and output the SingleDelete. (The value will be
+            // outputted on the next iteration.)
+
+            // Setting valid_ to true will output the current SingleDelete
+            validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck);
+
+            // Set up the Put to be outputted in the next iteration.
+            // (Optimization 3).
+            clear_and_output_next_key_ = true;
+            TEST_SYNC_POINT_CALLBACK(
+                "CompactionIterator::NextFromInput:KeepSDForWW",
+                /*arg=*/nullptr);
+          }
+        } else {
+          // We hit the next snapshot without hitting a put, so the iterator
+          // returns the single delete.
+          validity_info_.SetValid(ValidContext::kKeepSDForSnapshot);
+          TEST_SYNC_POINT_CALLBACK(
+              "CompactionIterator::NextFromInput:SingleDelete:3",
+              const_cast<Compaction*>(c));
+        }
+      } else {
+        // We are at the end of the input, could not parse the next key, or hit
+        // a different key. The iterator returns the single delete if the key
+        // possibly exists beyond the current output level. We set
+        // has_current_user_key to false so that if the iterator is at the next
+        // key, we do not compare it again against the previous key at the next
+        // iteration. If the next key is corrupt, we return before the
+        // comparison, so the value of has_current_user_key does not matter.
+        has_current_user_key_ = false;
+        if (compaction_ != nullptr &&
+            DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+            compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                       &level_ptrs_) &&
+            is_timestamp_eligible_for_gc) {
+          // Key doesn't exist outside of this range.
+          // Can compact out this SingleDelete.
+          ++iter_stats_.num_record_drop_obsolete;
+          ++iter_stats_.num_single_del_fallthru;
+          if (!bottommost_level_) {
+            ++iter_stats_.num_optimized_del_drop_obsolete;
+          }
+        } else if (last_key_seq_zeroed_) {
+          // Skip.
+          ++iter_stats_.num_record_drop_hidden;
+          ++iter_stats_.num_record_drop_obsolete;
+          assert(bottommost_level_);
+        } else {
+          // Output SingleDelete
+          validity_info_.SetValid(ValidContext::kKeepSD);
+        }
+      }
+
+      if (Valid()) {
+        at_next_ = true;
+      }
+    } else if (last_snapshot == current_user_key_snapshot_ ||
+               (last_snapshot > 0 &&
+                last_snapshot < current_user_key_snapshot_)) {
+      // If the earliest snapshot in which this key is visible is the same as
+      // the earliest snapshot in which a previous instance of the same key
+      // was visible, then this kv is not visible in any snapshot.
+      // Hidden by a newer entry for the same user key.
+      //
+      // Note: Dropping this key will not affect TransactionDB write-conflict
+      // checking since there has already been a record returned for this key
+      // in this snapshot.
+      if (last_sequence < current_user_key_sequence_) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "key %s, last_sequence (%" PRIu64
+                        ") < current_user_key_sequence_ (%" PRIu64 ")",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+                        last_sequence, current_user_key_sequence_);
+        assert(false);
+      }
+
+      ++iter_stats_.num_record_drop_hidden;  // rule (A)
+      AdvanceInputIter();
+    } else if (compaction_ != nullptr &&
+               (ikey_.type == kTypeDeletion ||
+                (ikey_.type == kTypeDeletionWithTimestamp &&
+                 cmp_with_history_ts_low_ < 0)) &&
+               DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+               compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                          &level_ptrs_)) {
+      // TODO(noetzli): This is the only place where we use compaction_
+      // (besides the constructor). We should probably get rid of this
+      // dependency and find a way to do similar filtering during flushes.
+      //
+      // For this user key:
+      // (1) there is no data in higher levels
+      // (2) data in lower levels will have larger sequence numbers
+      // (3) data in layers that are being compacted here and have
+      //     smaller sequence numbers will be dropped in the next
+      //     few iterations of this loop (by rule (A) above).
+      // Therefore this deletion marker is obsolete and can be dropped.
+      //
+      // Note: Dropping this Delete will not affect TransactionDB
+      // write-conflict checking since it is earlier than any snapshot.
+      //
+      // It seems that we can also drop a deletion later than the earliest
+      // snapshot given that:
+      // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+      // (2) No value exists earlier than the deletion.
+      //
+      // Note also that a deletion marker of type kTypeDeletionWithTimestamp
+      // will be treated as a different user key unless the timestamp is older
+      // than *full_history_ts_low_.
+      ++iter_stats_.num_record_drop_obsolete;
+      if (!bottommost_level_) {
+        ++iter_stats_.num_optimized_del_drop_obsolete;
+      }
+      AdvanceInputIter();
+    } else if ((ikey_.type == kTypeDeletion ||
+                (ikey_.type == kTypeDeletionWithTimestamp &&
+                 cmp_with_history_ts_low_ < 0)) &&
+               bottommost_level_) {
+      // Handle the case where we have a delete key at the bottommost level.
+      // We can skip outputting the key iff there are no subsequent puts for
+      // this key.
+      assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+                                 ikey_.user_key, &level_ptrs_));
+      ParsedInternalKey next_ikey;
+      AdvanceInputIter();
+#ifndef NDEBUG
+      const Compaction* c =
+          compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+      TEST_SYNC_POINT_CALLBACK(
+          "CompactionIterator::NextFromInput:BottommostDelete:1",
+          const_cast<Compaction*>(c));
+      // Skip over all versions of this key that happen to occur in the same
+      // snapshot range as the delete.
+      //
+      // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
+      // considered to have a different user key unless the timestamp is older
+      // than *full_history_ts_low_.
+      while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+             input_.Valid() &&
+             (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+                  .ok()) &&
+             cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
+             (prev_snapshot == 0 ||
+              DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
+        AdvanceInputIter();
+      }
+      // If we still need to output a row with this key, we need to output the
+      // delete too.
+      if (input_.Valid() &&
+          (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+               .ok()) &&
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+        validity_info_.SetValid(ValidContext::kKeepDel);
+        at_next_ = true;
+      }
+    } else if (ikey_.type == kTypeMerge) {
+      if (!merge_helper_->HasOperator()) {
+        status_ = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        return;
+      }
+
+      pinned_iters_mgr_.StartPinning();
+
+      // We know the merge type entry is not hidden, otherwise we would
+      // have hit (A).
+      // We encapsulate the merge-related state machine in a different
+      // object to minimize change to the existing flow.
+      Status s = merge_helper_->MergeUntil(
+          &input_, range_del_agg_, prev_snapshot, bottommost_level_,
+          allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
+          prefetch_buffers_.get(), &iter_stats_);
+      merge_out_iter_.SeekToFirst();
+
+      if (!s.ok() && !s.IsMergeInProgress()) {
+        status_ = s;
+        return;
+      } else if (merge_out_iter_.Valid()) {
+        // NOTE: key, value, and ikey_ refer to old entries.
+        //       These will be correctly set below.
+        key_ = merge_out_iter_.key();
+        value_ = merge_out_iter_.value();
+        pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include it in the result, so we expect the keys here to be valid.
+        if (!pik_status.ok()) {
+          ROCKS_LOG_FATAL(
+              info_log_, "Invalid key %s in compaction. %s",
+              allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+              pik_status.getState());
+          assert(false);
+        }
+        // Keep current_key_ in sync.
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+        key_ = current_key_.GetInternalKey();
+        ikey_.user_key = current_key_.GetUserKey();
+        validity_info_.SetValid(ValidContext::kMerge2);
+      } else {
+        // All merge operands were filtered out. Reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges.
+        has_current_user_key_ = false;
+        pinned_iters_mgr_.ReleasePinnedData();
+
+        if (merge_helper_->FilteredUntil(&skip_until)) {
+          need_skip = true;
+        }
+      }
+    } else {
+      // 1. new user key -OR-
+      // 2. different snapshot stripe
+      // If user-defined timestamp is enabled, we consider keys for GC if they
+      // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete()
+      // only considers range deletions that are at or below history_ts_low_
+      // and trim_ts_. We drop keys here that are below history_ts_low_ and are
+      // covered by a range tombstone that is at or below history_ts_low_ and
+      // trim_ts_.
+      bool should_delete = false;
+      if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+        should_delete = range_del_agg_->ShouldDelete(
+            key_, RangeDelPositioningMode::kForwardTraversal);
+      }
+      if (should_delete) {
+        ++iter_stats_.num_record_drop_hidden;
+        ++iter_stats_.num_record_drop_range_del;
+        AdvanceInputIter();
+      } else {
+        validity_info_.SetValid(ValidContext::kNewUserKey);
+      }
+    }
+
+    if (need_skip) {
+      SkipUntil(skip_until);
+    }
+  }
+
+  if (!Valid() && IsShuttingDown()) {
+    status_ = Status::ShutdownInProgress();
+  }
+
+  if (IsPausingManualCompaction()) {
+    status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  // Propagate corruption status from the memtable iterator
+  if (!input_.Valid() && input_.status().IsCorruption()) {
+    status_ = input_.status();
+  }
+}
+
+bool CompactionIterator::ExtractLargeValueIfNeededImpl() {
+  if (!blob_file_builder_) {
+    return false;
+  }
+
+  blob_index_.clear();
+  const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_);
+
+  if (!s.ok()) {
+    status_ = s;
+    validity_info_.Invalidate();
+
+    return false;
+  }
+
+  if (blob_index_.empty()) {
+    return false;
+  }
+
+  value_ = blob_index_;
+
+  return true;
+}
+
+void CompactionIterator::ExtractLargeValueIfNeeded() {
+  assert(ikey_.type == kTypeValue);
+
+  if (!ExtractLargeValueIfNeededImpl()) {
+    return;
+  }
+
+  ikey_.type = kTypeBlobIndex;
+  current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+}
+
+void CompactionIterator::GarbageCollectBlobIfNeeded() {
+  assert(ikey_.type == kTypeBlobIndex);
+
+  if (!compaction_) {
+    return;
+  }
+
+  // GC for integrated BlobDB
+  if (compaction_->enable_blob_garbage_collection()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+        &value_);
+
+    BlobIndex blob_index;
+
+    {
+      const Status s = blob_index.DecodeFrom(value_);
+
+      if (!s.ok()) {
+        status_ = s;
+        validity_info_.Invalidate();
+
+        return;
+      }
+    }
+
+    if (blob_index.file_number() >=
+        blob_garbage_collection_cutoff_file_number_) {
+      return;
+    }
+
+    FilePrefetchBuffer* prefetch_buffer =
+        prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+                                blob_index.file_number())
+                          : nullptr;
+
+    uint64_t bytes_read = 0;
+
+    {
+      assert(blob_fetcher_);
+
+      const Status s = blob_fetcher_->FetchBlob(
+          user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read);
+
+      if (!s.ok()) {
+        status_ = s;
+        validity_info_.Invalidate();
+
+        return;
+      }
+    }
+
+    ++iter_stats_.num_blobs_read;
+    iter_stats_.total_blob_bytes_read += bytes_read;
+
+    ++iter_stats_.num_blobs_relocated;
+    iter_stats_.total_blob_bytes_relocated += blob_index.size();
+
+    value_ = blob_value_;
+
+    if (ExtractLargeValueIfNeededImpl()) {
+      return;
+    }
+
+    ikey_.type = kTypeValue;
+    current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+
+    return;
+  }
+
+  // GC for stacked BlobDB
+  if (compaction_filter_ &&
+      compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+    const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+        user_key(), value_, &compaction_filter_value_);
+
+    if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+      status_ =
+          Status::Corruption("Corrupted blob reference encountered during GC");
+      validity_info_.Invalidate();
+
+      return;
+    }
+
+    if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+      status_ = Status::IOError("Could not relocate blob during GC");
+      validity_info_.Invalidate();
+
+      return;
+    }
+
+    if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) {
+      value_ = compaction_filter_value_;
+
+      return;
+    }
+  }
+}
+
+void CompactionIterator::DecideOutputLevel() {
+  assert(compaction_->SupportsPerKeyPlacement());
+#ifndef NDEBUG
+  // Could be overridden by unittest
+  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+                                 ikey_.sequence);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+                           &context);
+  output_to_penultimate_level_ = context.output_to_penultimate_level;
+#else
+  output_to_penultimate_level_ = false;
+#endif  // NDEBUG
+
+  // If the key is newer than the cutoff sequence or within the earliest
+  // snapshot, it should be output to the penultimate level.
+  if (ikey_.sequence > preclude_last_level_min_seqno_ ||
+      ikey_.sequence > earliest_snapshot_) {
+    output_to_penultimate_level_ = true;
+  }
+
+  if (output_to_penultimate_level_) {
+    // If it's decided to output to the penultimate level, but it is unsafe to
+    // do so, still output to the last level. For example, moving the data from
+    // a lower level to a higher level outside of the higher-level input key
+    // range is considered unsafe, because the key may conflict with
+    // higher-level SSTs not from this compaction.
+    // TODO: add a statistic for declined output_to_penultimate_level
+    bool safe_to_penultimate_level =
+        compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+    if (!safe_to_penultimate_level) {
+      output_to_penultimate_level_ = false;
+      // It could happen when disabling/enabling `last_level_temperature` while
+      // holding a snapshot. When `last_level_temperature` is not set
+      // (==kUnknown), the data newer than any snapshot is pushed to the last
+      // level, but when the per_key_placement feature is enabled on the fly,
+      // the data later than the snapshot has to be moved to the penultimate
+      // level, which may or may not be safe. So the user needs to make sure
+      // all snapshots are released before enabling the
+      // `last_level_temperature` feature.
+      // We will migrate the feature to `last_level_temperature` and maybe make
+      // it not dynamically changeable.
+      if (ikey_.sequence > earliest_snapshot_) {
+        status_ = Status::Corruption(
+            "Unsafe to store Seq later than snapshot in the last level if "
+            "per_key_placement is enabled");
+      }
+    }
+  }
+}
+
+void CompactionIterator::PrepareOutput() {
+  if (Valid()) {
+    if (ikey_.type == kTypeValue) {
+      ExtractLargeValueIfNeeded();
+    } else if (ikey_.type == kTypeBlobIndex) {
+      GarbageCollectBlobIfNeeded();
+    }
+
+    if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+      DecideOutputLevel();
+    }
+
+    // Zeroing out the sequence number leads to better compression.
+    // If this is the bottommost level (no files in lower levels)
+    // and the earliest snapshot is larger than this seqno
+    // and the userkey differs from the last userkey in compaction,
+    // then we can squash the seqno to zero.
+    //
+    // This is safe for TransactionDB write-conflict checking since
+    // transactions only care about sequence numbers larger than any active
+    // snapshots.
+    //
+    // Can we do the same for levels above the bottom level as long as
+    // KeyNotExistsBeyondOutputLevel() returns true?
+    if (Valid() && compaction_ != nullptr &&
+        !compaction_->allow_ingest_behind() && bottommost_level_ &&
+        DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+        ikey_.type != kTypeMerge && current_key_committed_ &&
+        !output_to_penultimate_level_ &&
+        ikey_.sequence < preserve_time_min_seqno_) {
+      if (ikey_.type == kTypeDeletion ||
+          (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
+        ROCKS_LOG_FATAL(
+            info_log_,
+            "Unexpected key %s for seq-zero optimization. "
+            "earliest_snapshot %" PRIu64
+            ", earliest_write_conflict_snapshot %" PRIu64
+            " job_snapshot %" PRIu64
+            ". timestamp_size: %d full_history_ts_low_ %s. validity %x",
+            ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+            earliest_snapshot_, earliest_write_conflict_snapshot_,
+            job_snapshot_, static_cast<int>(timestamp_size_),
+            full_history_ts_low_ != nullptr
+                ? Slice(*full_history_ts_low_).ToString(true).c_str()
+                : "null",
+            validity_info_.rep);
+        assert(false);
+      }
+      ikey_.sequence = 0;
+      last_key_seq_zeroed_ = true;
+      TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+                               &ikey_);
+      if (!timestamp_size_) {
+        current_key_.UpdateInternalKey(0, ikey_.type);
+      } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
+        // We can also zero out the timestamp for better compression.
+        // For the same user key (excluding timestamp), the timestamp-based
+        // history can be collapsed to save some space if the timestamp is
+        // older than *full_history_ts_low_.
+        const std::string kTsMin(timestamp_size_,
+                                 static_cast<unsigned char>(0));
+        const Slice ts_slice = kTsMin;
+        ikey_.SetTimestamp(ts_slice);
+        current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+      }
+    }
+  }
+}
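// Editorial note (not part of the diff): conceptually, the
// DefinitelyInSnapshot() check used above means "seq is at or below the
// snapshot, and the snapshot checker (if any) confirms visibility". A
// simplified model of that predicate:
//
//   bool DefinitelyInSnapshotModel(SequenceNumber seq, SequenceNumber snap,
//                                  const SnapshotChecker* checker) {
//     return seq <= snap &&
//            (checker == nullptr ||
//             checker->CheckInSnapshot(seq, snap) ==
//                 SnapshotCheckerResult::kInSnapshot);
//   }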
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+    SequenceNumber in, SequenceNumber* prev_snapshot) {
+  assert(snapshots_->size());
+  if (snapshots_->size() == 0) {
+    ROCKS_LOG_FATAL(info_log_,
+                    "No snapshot left in findEarliestVisibleSnapshot");
+  }
+  auto snapshots_iter =
+      std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+  assert(prev_snapshot != nullptr);
+  if (snapshots_iter == snapshots_->begin()) {
+    *prev_snapshot = 0;
+  } else {
+    *prev_snapshot = *std::prev(snapshots_iter);
+    if (*prev_snapshot >= in) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      *prev_snapshot, in);
+      assert(false);
+    }
+  }
+  if (snapshot_checker_ == nullptr) {
+    return snapshots_iter != snapshots_->end() ? *snapshots_iter
+                                               : kMaxSequenceNumber;
+  }
+  bool has_released_snapshot = !released_snapshots_.empty();
+  for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+    auto cur = *snapshots_iter;
+    if (in > cur) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "in (%" PRIu64 ") > cur (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      in, cur);
+      assert(false);
+    }
+    // Skip if cur is in released_snapshots.
+    if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+      continue;
+    }
+    auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+    if (res == SnapshotCheckerResult::kInSnapshot) {
+      return cur;
+    } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+      released_snapshots_.insert(cur);
+    }
+    *prev_snapshot = cur;
+  }
+  return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return 0;
+  }
+
+  if (!compaction->enable_blob_garbage_collection()) {
+    return 0;
+  }
+
+  const Version* const version = compaction->input_version();
+  assert(version);
+
+  const VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+
+  const size_t cutoff_index = static_cast<size_t>(
+      compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
+
+  if (cutoff_index >= blob_files.size()) {
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+  const auto& meta = blob_files[cutoff_index];
+  assert(meta);
+
+  return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  const Version* const version = compaction->input_version();
+  if (!version) {
+    return nullptr;
+  }
+
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+
+  return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  if (!compaction->input_version()) {
+    return nullptr;
+  }
+
+  if (compaction->allow_mmap_reads()) {
+    return nullptr;
+  }
+
+  const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+  if (!readahead_size) {
+    return nullptr;
+  }
+
+  return std::unique_ptr<PrefetchBufferCollection>(
+      new PrefetchBufferCollection(readahead_size));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
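// Editorial note (not part of the diff): findEarliestVisibleSnapshot() above
// is a lower_bound over the sorted snapshot list. With hypothetical snapshots
// {3, 7, 12} and a key at seq = 5:
//
//   std::vector<SequenceNumber> snaps{3, 7, 12};
//   auto it = std::lower_bound(snaps.begin(), snaps.end(), 5);  // -> 7
//   // prev_snapshot = 3: the key is visible to snapshots 7 and 12,
//   // and hidden from snapshot 3.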
new file mode 100644
index 000000000..c215d2bbb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper of internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+  SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+                      bool need_count_entries)
+      : icmp_(cmp),
+        inner_iter_(iter),
+        need_count_entries_(need_count_entries) {}
+  bool Valid() const override { return inner_iter_->Valid(); }
+  Status status() const override { return inner_iter_->status(); }
+  void Next() override {
+    num_itered_++;
+    inner_iter_->Next();
+  }
+  void Seek(const Slice& target) override {
+    if (!need_count_entries_) {
+      inner_iter_->Seek(target);
+    } else {
+      // For flush cases, we need to count total number of entries, so we
+      // do Next() rather than Seek().
+      while (inner_iter_->Valid() &&
+             icmp_.Compare(inner_iter_->key(), target) < 0) {
+        Next();
+      }
+    }
+  }
+  Slice key() const override { return inner_iter_->key(); }
+  Slice value() const override { return inner_iter_->value(); }
+
+  // Unused InternalIterator methods
+  void SeekToFirst() override { assert(false); }
+  void Prev() override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+  uint64_t num_itered() const { return num_itered_; }
+
+ private:
+  InternalKeyComparator icmp_;
+  InternalIterator* inner_iter_;  // not owned
+  uint64_t num_itered_ = 0;
+  bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+  // A wrapper around Compaction. Has a much smaller interface, only what
+  // CompactionIterator uses. Tests can override it.
+  class CompactionProxy {
+   public:
+    virtual ~CompactionProxy() = default;
+
+    virtual int level() const = 0;
+
+    virtual bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+    virtual bool bottommost_level() const = 0;
+
+    virtual int number_levels() const = 0;
+
+    // Result includes timestamp if user-defined timestamp is enabled.
+ virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool allow_mmap_reads() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual uint64_t blob_compaction_readahead_size() const = 0; + + virtual const Version* input_version() const = 0; + + virtual bool DoesInputReferenceBlobFiles() const = 0; + + virtual const Compaction* real_compaction() const = 0; + + virtual bool SupportsPerKeyPlacement() const = 0; + + // `key` includes timestamp if user-defined timestamp is enabled. + virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const override { + return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); + } + + bool bottommost_level() const override { + return compaction_->bottommost_level(); + } + + int number_levels() const override { return compaction_->number_levels(); } + + // Result includes timestamp if user-defined timestamp is enabled. + Slice GetLargestUserKey() const override { + return compaction_->GetLargestUserKey(); + } + + bool allow_ingest_behind() const override { + return compaction_->immutable_options()->allow_ingest_behind; + } + + bool allow_mmap_reads() const override { + return compaction_->immutable_options()->allow_mmap_reads; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->enable_blob_garbage_collection(); + } + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->blob_garbage_collection_age_cutoff(); + } + + uint64_t blob_compaction_readahead_size() const override { + return compaction_->mutable_cf_options()->blob_compaction_readahead_size; + } + + const Version* input_version() const override { + return compaction_->input_version(); + } + + bool DoesInputReferenceBlobFiles() const override { + return compaction_->DoesInputReferenceBlobFiles(); + } + + const Compaction* real_compaction() const override { return compaction_; } + + bool SupportsPerKeyPlacement() const override { + return compaction_->SupportsPerKeyPlacement(); + } + + // Check if key is within penultimate level output range, to see if it's + // safe to output to the penultimate level for per_key_placement feature. + // `key` includes timestamp if user-defined timestamp is enabled. 
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override { + return compaction_->WithinPenultimateLevelOutputRange(key); + } + + private: + const Compaction* compaction_; + }; + + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic& manual_compaction_canceled, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr, + const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber, + const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber); + + // Constructor with custom CompactionProxy, used for tests. + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic& manual_compaction_canceled, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr, + const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber, + const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber); + + ~CompactionIterator(); + + void ResetRecordCounts(); + + // Seek to the beginning of the compaction iterator output. + // + // REQUIRED: Call only once. + void SeekToFirst(); + + // Produces the next record in the compaction. + // + // REQUIRED: SeekToFirst() has been called. + void Next(); + + // Getters + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + const Status& status() const { return status_; } + const ParsedInternalKey& ikey() const { return ikey_; } + inline bool Valid() const { return validity_info_.IsValid(); } + const Slice& user_key() const { return current_user_key_; } + const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } + // If the current key should be placed on penultimate level, only valid if + // per_key_placement is supported + bool output_to_penultimate_level() const { + return output_to_penultimate_level_; + } + Status InputStatus() const { return input_.status(); } + + private: + // Processes the input stream to find the next output + void NextFromInput(); + + // Do final preparations before presenting the output to the callee. 
+ void PrepareOutput(); + + // Decide the current key should be output to the last level or penultimate + // level, only call for compaction supports per key placement + void DecideOutputLevel(); + + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. + bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + + // Invoke compaction filter if needed. + // Return true on success, false on failures (e.g.: kIOError). + bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + + // Given a sequence number, return the sequence number of the + // earliest snapshot that this sequence number is visible in. + // The snapshots themselves are arranged in ascending order of + // sequence numbers. + // Employ a sequential search because the total number of + // snapshots are typically small. + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot); + + inline bool KeyCommitted(SequenceNumber sequence) { + return snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) == + SnapshotCheckerResult::kInSnapshot; + } + + bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. + inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } + + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + static std::unique_ptr CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction); + static std::unique_ptr + CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction); + + SequenceIterWrapper input_; + const Comparator* cmp_; + MergeHelper* merge_helper_; + const std::vector* snapshots_; + // List of snapshots released during compaction. + // findEarliestVisibleSnapshot() find them out from return of + // snapshot_checker, and make sure they will not be returned as + // earliest visible snapshot of an older value. + // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3. 
+ std::unordered_set released_snapshots_; + const SequenceNumber earliest_write_conflict_snapshot_; + const SequenceNumber job_snapshot_; + const SnapshotChecker* const snapshot_checker_; + Env* env_; + SystemClock* clock_; + const bool report_detailed_time_; + const bool expect_valid_internal_key_; + CompactionRangeDelAggregator* range_del_agg_; + BlobFileBuilder* blob_file_builder_; + std::unique_ptr compaction_; + const CompactionFilter* compaction_filter_; + const std::atomic* shutting_down_; + const std::atomic& manual_compaction_canceled_; + const bool bottommost_level_; + const bool visible_at_tip_; + const SequenceNumber earliest_snapshot_; + + std::shared_ptr info_log_; + + const bool allow_data_in_errors_; + + const bool enforce_single_del_contracts_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. + // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + + // State + // + enum ValidContext : uint8_t { + kMerge1 = 0, + kMerge2 = 1, + kParseKeyError = 2, + kCurrentKeyUncommitted = 3, + kKeepSDAndClearPut = 4, + kKeepTsHistory = 5, + kKeepSDForConflictCheck = 6, + kKeepSDForSnapshot = 7, + kKeepSD = 8, + kKeepDel = 9, + kNewUserKey = 10, + }; + + struct ValidityInfo { + inline bool IsValid() const { return rep & 1; } + ValidContext GetContext() const { + return static_cast(rep >> 1); + } + inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; } + inline void Invalidate() { rep = 0; } + + uint8_t rep{0}; + } validity_info_; + + // Points to a copy of the current compaction iterator output (current_key_) + // if valid. + Slice key_; + // Points to the value in the underlying iterator that corresponds to the + // current output. + Slice value_; + // The status is OK unless compaction iterator encounters a merge operand + // while not having a merge operator defined. + Status status_; + // Stores the user key, sequence number and type of the current compaction + // iterator output (or current key in the underlying iterator during + // NextFromInput()). + ParsedInternalKey ikey_; + // Stores whether ikey_.user_key is valid. If set to false, the user key is + // not compared against the current key in the underlying iterator. + bool has_current_user_key_ = false; + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + + IterKey current_key_; + Slice current_user_key_; + std::string curr_ts_; + SequenceNumber current_user_key_sequence_; + SequenceNumber current_user_key_snapshot_; + + // True if the iterator has already returned a record for the current key. + bool has_outputted_key_ = false; + + // truncated the value of the next key and output it without applying any + // compaction rules. This is used for outputting a put after a single delete. + bool clear_and_output_next_key_ = false; + + MergeOutputIterator merge_out_iter_; + // PinnedIteratorsManager used to pin input_ Iterator blocks while reading + // merge operands and then releasing them after consuming them. 
+ PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + + std::unique_ptr blob_fetcher_; + std::unique_ptr prefetch_buffers_; + + std::string blob_index_; + PinnableSlice blob_value_; + std::string compaction_filter_value_; + InternalKey compaction_filter_skip_until_; + // "level_ptrs" holds indices that remember which file of an associated + // level we were last checking during the last call to compaction-> + // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function + // to pick off where it left off since each subcompaction's key range is + // increasing so a later call to the function must be looking for a key that + // is in or beyond the last file checked during the previous call + std::vector level_ptrs_; + CompactionIterationStats iter_stats_; + + // Used to avoid purging uncommitted values. The application can specify + // uncommitted values by providing a SnapshotChecker object. + bool current_key_committed_; + + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + // True if the previous internal key (same user key)'s sequence number has + // just been zeroed out during bottommost compaction. + bool last_key_seq_zeroed_{false}; + + // True if the current key should be output to the penultimate level if + // possible, compaction logic makes the final decision on which level to + // output to. + bool output_to_penultimate_level_{false}; + + // min seqno for preserving the time information. + const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber; + + // min seqno to preclude the data from the last level, if the key seqno larger + // than this, it will be output to penultimate level + const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } + + bool IsShuttingDown() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); + } + + bool IsPausingManualCompaction() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return manual_compaction_canceled_.load(std::memory_order_relaxed); + } +}; + +inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, + SequenceNumber snapshot) { + return ((seq) <= (snapshot) && + (snapshot_checker_ == nullptr || + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kInSnapshot))); +} + +inline bool CompactionIterator::DefinitelyNotInSnapshot( + SequenceNumber seq, SequenceNumber snapshot) { + return ((seq) > (snapshot) || + (snapshot_checker_ != nullptr && + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kNotInSnapshot))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc new file mode 100644 index 000000000..81362d792 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc @@ -0,0 +1,1618 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
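Editorial aside (illustrative sketch, not upstream code): the DefinitelyInSnapshot() and DefinitelyNotInSnapshot() helpers that close the header above are deliberately asymmetric three-way tests. With a SnapshotChecker present, `seq <= snapshot` alone proves nothing, because a write-prepared transaction can allocate a sequence number before a snapshot is taken yet commit after it. The same tri-state reasoning in a minimal standalone form, with hypothetical names and a toy checker in place of the real SnapshotChecker:

    #include <cassert>
    #include <cstdint>

    enum class CheckResult { kIn, kNotIn, kUnknown };

    // Toy stand-in for SnapshotChecker::CheckInSnapshot(): everything up to
    // sequence 100 is known-committed, anything newer is still undecided.
    CheckResult Check(uint64_t seq, uint64_t snapshot) {
      if (seq <= 100) return CheckResult::kIn;
      if (seq > snapshot) return CheckResult::kNotIn;
      return CheckResult::kUnknown;
    }

    bool DefinitelyIn(uint64_t seq, uint64_t snapshot) {
      return seq <= snapshot && Check(seq, snapshot) == CheckResult::kIn;
    }

    bool DefinitelyNotIn(uint64_t seq, uint64_t snapshot) {
      return seq > snapshot || Check(seq, snapshot) == CheckResult::kNotIn;
    }

    int main() {
      assert(DefinitelyIn(50, 200));      // committed and ordered before
      assert(DefinitelyNotIn(300, 200));  // sequence past the snapshot
      // 150 <= 200, but its commit status is unknown: neither test fires, so
      // a compaction must conservatively treat the entry as possibly visible.
      assert(!DefinitelyIn(150, 200) && !DefinitelyNotIn(150, 200));
      return 0;
    }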
+ +#include "db/compaction/compaction_iterator.h" + +#include +#include + +#include "db/dbformat.h" +#include "port/port.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "util/vector_iterator.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// Expects no merging attempts. +class NoMergingMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + ADD_FAILURE(); + return false; + } + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + ADD_FAILURE(); + return false; + } + const char* Name() const override { + return "CompactionIteratorTest NoMergingMergeOp"; + } +}; + +// Compaction filter that gets stuck when it sees a particular key, +// then gets unstuck when told to. +// Always returns Decision::kRemove. +class StallingFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int k = std::atoi(key.ToString().c_str()); + last_seen.store(k); + while (k >= stall_at.load()) { + std::this_thread::yield(); + } + return Decision::kRemove; + } + + const char* Name() const override { + return "CompactionIteratorTest StallingFilter"; + } + + // Wait until the filter sees a key >= k and stalls at that key. + // If `exact`, asserts that the seen key is equal to k. + void WaitForStall(int k, bool exact = true) { + stall_at.store(k); + while (last_seen.load() < k) { + std::this_thread::yield(); + } + if (exact) { + EXPECT_EQ(k, last_seen.load()); + } + } + + // Filter will stall on key >= stall_at. Advance stall_at to unstall. + mutable std::atomic stall_at{0}; + // Last key the filter was called with. + mutable std::atomic last_seen{0}; +}; + +// Compaction filter that filter out all keys. 
+class FilterAllKeysCompactionFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "AllKeysCompactionFilter"; } +}; + +class LoggingForwardVectorIterator : public VectorIterator { + public: + struct Action { + enum class Type { + SEEK_TO_FIRST, + SEEK, + NEXT, + }; + + Type type; + std::string arg; + + explicit Action(Type _type, std::string _arg = "") + : type(_type), arg(_arg) {} + + bool operator==(const Action& rhs) const { + return std::tie(type, arg) == std::tie(rhs.type, rhs.arg); + } + }; + + LoggingForwardVectorIterator(const std::vector& keys, + const std::vector& values) + : VectorIterator(keys, values) { + current_ = keys_.size(); + } + + void SeekToFirst() override { + log.emplace_back(Action::Type::SEEK_TO_FIRST); + VectorIterator::SeekToFirst(); + } + void SeekToLast() override { assert(false); } + + void Seek(const Slice& target) override { + log.emplace_back(Action::Type::SEEK, target.ToString()); + VectorIterator::Seek(target); + } + + void SeekForPrev(const Slice& /*target*/) override { assert(false); } + + void Next() override { + assert(Valid()); + log.emplace_back(Action::Type::NEXT); + VectorIterator::Next(); + } + void Prev() override { assert(false); } + + Slice key() const override { + assert(Valid()); + return VectorIterator::key(); + } + Slice value() const override { + assert(Valid()); + return VectorIterator::value(); + } + + std::vector log; +}; + +class FakeCompaction : public CompactionIterator::CompactionProxy { + public: + int level() const override { return 0; } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& /*user_key*/, + std::vector* /*level_ptrs*/) const override { + return is_bottommost_level || key_not_exists_beyond_output_level; + } + + bool bottommost_level() const override { return is_bottommost_level; } + + int number_levels() const override { return 1; } + + Slice GetLargestUserKey() const override { + return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + } + + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } + + bool allow_mmap_reads() const override { return false; } + + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + uint64_t blob_compaction_readahead_size() const override { return 0; } + + const Version* input_version() const override { return nullptr; } + + bool DoesInputReferenceBlobFiles() const override { return false; } + + const Compaction* real_compaction() const override { return nullptr; } + + bool SupportsPerKeyPlacement() const override { + return supports_per_key_placement; + } + + bool WithinPenultimateLevelOutputRange(const Slice& key) const override { + return (!key.starts_with("unsafe_pb")); + } + + bool key_not_exists_beyond_output_level = false; + + bool is_bottommost_level = false; + + bool is_allow_ingest_behind = false; + + bool supports_per_key_placement = false; +}; + +// A simplified snapshot checker which assumes each snapshot has a global +// last visible sequence. 
+class TestSnapshotChecker : public SnapshotChecker { + public: + explicit TestSnapshotChecker( + SequenceNumber last_committed_sequence, + const std::unordered_map& snapshots = + {{}}) + : last_committed_sequence_(last_committed_sequence), + snapshots_(snapshots) {} + + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + if (snapshot_seq == kMaxSequenceNumber) { + return seq <= last_committed_sequence_ + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + assert(snapshots_.count(snapshot_seq) > 0); + return seq <= snapshots_.at(snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + private: + SequenceNumber last_committed_sequence_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshots_; +}; + +// Test param: +// bool: whether to pass snapshot_checker to compaction iterator. +class CompactionIteratorTest : public testing::TestWithParam { + public: + CompactionIteratorTest() + : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + + void InitIterators( + const std::vector& ks, const std::vector& vs, + const std::vector& range_del_ks, + const std::vector& range_del_vs, + SequenceNumber last_sequence, + SequenceNumber last_committed_sequence = kMaxSequenceNumber, + MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { + std::unique_ptr unfragmented_range_del_iter( + new VectorIterator(range_del_ks, range_del_vs, &icmp_)); + auto tombstone_list = std::make_shared( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); + + std::unique_ptr compaction; + if (filter || bottommost_level || key_not_exists_beyond_output_level) { + compaction_proxy_ = new FakeCompaction(); + compaction_proxy_->is_bottommost_level = bottommost_level; + compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; + compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement(); + compaction.reset(compaction_proxy_); + } + bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); + if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) { + snapshot_checker_.reset( + new TestSnapshotChecker(last_committed_sequence, snapshot_map_)); + } + merge_helper_.reset( + new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false, + 0 /*latest_snapshot*/, snapshot_checker_.get(), + 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
+ c_iter_.reset(); + } + iter_.reset(new LoggingForwardVectorIterator(ks, vs)); + iter_->SeekToFirst(); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, + earliest_write_conflict_snapshot, kMaxSequenceNumber, + snapshot_checker_.get(), Env::Default(), + false /* report_detailed_time */, false, range_del_agg_.get(), + nullptr /* blob_file_builder */, true /*allow_data_in_errors*/, + true /*enforce_single_del_contracts*/, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_, + std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr, + full_history_ts_low)); + } + + void AddSnapshot(SequenceNumber snapshot, + SequenceNumber last_visible_seq = kMaxSequenceNumber) { + snapshots_.push_back(snapshot); + snapshot_map_[snapshot] = last_visible_seq; + } + + virtual bool UseSnapshotChecker() const { return false; } + + virtual bool AllowIngestBehind() const { return false; } + + virtual bool SupportsPerKeyPlacement() const { return false; } + + void RunTest( + const std::vector& input_keys, + const std::vector& input_values, + const std::vector& expected_keys, + const std::vector& expected_values, + SequenceNumber last_committed_seq = kMaxSequenceNumber, + MergeOperator* merge_operator = nullptr, + CompactionFilter* compaction_filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { + InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, + last_committed_seq, merge_operator, compaction_filter, + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); + c_iter_->SeekToFirst(); + for (size_t i = 0; i < expected_keys.size(); i++) { + std::string info = "i = " + std::to_string(i); + ASSERT_TRUE(c_iter_->Valid()) << info; + ASSERT_OK(c_iter_->status()) << info; + ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info; + ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; + c_iter_->Next(); + } + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); + } + + void ClearSnapshots() { + snapshots_.clear(); + snapshot_map_.clear(); + } + + const Comparator* cmp_; + const InternalKeyComparator icmp_; + std::vector snapshots_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshot_map_; + std::unique_ptr merge_helper_; + std::unique_ptr iter_; + std::unique_ptr c_iter_; + std::unique_ptr range_del_agg_; + std::unique_ptr snapshot_checker_; + std::atomic shutting_down_{false}; + const std::atomic kManualCompactionCanceledFalse_{false}; + FakeCompaction* compaction_proxy_; +}; + +// It is possible that the output of the compaction iterator is empty even if +// the input is not. +TEST_P(CompactionIteratorTest, EmptyResult) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue)}, + {"", "val"}, {}, {}, 5); + c_iter_->SeekToFirst(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +// If there is a corruption after a single deletion, the corrupted key should +// be preserved. 
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue, true), + test::KeyStr("b", 10, kTypeValue)}, + {"", "val", "val2"}, {}, {}, 10); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion), + c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { + InitIterators({test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("morning", 2, kTypeValue), + test::KeyStr("night", 3, kTypeValue)}, + {"zao", "zao", "wan"}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { + AddSnapshot(10); + std::vector ks1; + ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion)); + std::vector vs1{"mz"}; + std::vector ks2{test::KeyStr("morning", 15, kTypeValue), + test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("night", 40, kTypeValue), + test::KeyStr("night", 20, kTypeValue)}; + std::vector vs2{"zao 15", "zao 5", "wan 40", "wan 20"}; + InitIterators(ks2, vs2, ks1, vs1, 40); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* skip_until) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("av50", v); + return Decision::kKeep; + } + if (k == "b") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("bv60", v); + *skip_until = "d+"; + return Decision::kRemoveAndSkipUntil; + } + if (k == "e") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("em71", v); + return Decision::kKeep; + } + if (k == "f") { + if (v == "fm65") { + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "f"; + } else { + EXPECT_EQ("fm30", v); + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "g+"; + } + return Decision::kRemoveAndSkipUntil; + } + if (k == "h") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("hv91", v); + return Decision::kKeep; + } + if (k == "i") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("im95", v); + *skip_until = "z"; + return Decision::kRemoveAndSkipUntil; + } + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter"; + } + }; + + NoMergingMergeOp merge_op; + Filter filter; + InitIterators( + {test::KeyStr("a", 50, kTypeValue), // keep + test::KeyStr("a", 45, kTypeMerge), + test::KeyStr("b", 60, kTypeValue), // skip to "d+" + test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue), + test::KeyStr("d", 70, kTypeMerge), + test::KeyStr("e", 71, kTypeMerge), // keep + test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep + test::KeyStr("f", 30, kTypeMerge), // skip to "g+" + test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue), + test::KeyStr("h", 91, kTypeValue), // keep + test::KeyStr("i", 95, kTypeMerge), // skip to "z" + test::KeyStr("j", 99, kTypeValue)}, + {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", + "fv25", "gv90", "hv91", "im95", "jv99"}, + {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); + + // Compaction should output just "a", "e" and "h" keys. + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("av50", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("em71", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("hv91", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); + + // Check that the compaction iterator did the correct sequence of calls on + // the underlying iterator. 
+ using A = LoggingForwardVectorIterator::Action; + using T = A::Type; + std::vector expected_actions = { + A(T::SEEK_TO_FIRST), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))}; + ASSERT_EQ(expected_actions, iter_->log); +} + +TEST_P(CompactionIteratorTest, ShuttingDownInFilter) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue), + test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + // Don't leave tombstones (kTypeDeletion) for filtered keys. + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + EXPECT_FALSE(c_iter_->Valid()); + EXPECT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +// Same as ShuttingDownInFilter, but shutdown happens during filter call for +// a merge operand, not for a value. +TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge), + test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); + ASSERT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +TEST_P(CompactionIteratorTest, SingleMergeOperand) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { + return Decision::kKeep; + } + + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } + }; + + class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. + EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } + + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); + } + merge_out->new_value = temp_value; + + return true; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); + + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); + } + + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); + } + swap(temp_value, *new_value); + + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + + bool AllowSingleOperand() const override { return true; } + }; + + SingleMergeOp merge_op; + Filter filter; + InitIterators( + // a should invoke PartialMergeMulti with a single merge operand. + {test::KeyStr("a", 50, kTypeMerge), + // b should invoke PartialMergeMulti with two operands. + test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge), + // c should invoke FullMerge due to kTypeValue at the beginning. + test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, + {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, + kMaxSequenceNumber, &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("av1", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + +// In bottommost level, values earlier than earliest snapshot can be output +// with sequence = 0. +TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, deletions earlier than earliest snapshot can be removed +// permanently. 
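+// Editorial note (worked example, not upstream code): with the earliest
+// snapshot at sequence 1, the tombstone a@1 is visible to every snapshot, so
+// at the bottommost level it shadows nothing a reader could still need and
+// can be dropped outright. The tombstone b@3, however, is newer than the
+// earliest snapshot: a reader at snapshot 1 must still see b@1, so b@3 has to
+// be kept to keep shadowing b@1 for newer readers, and b@1 survives with its
+// sequence number zeroed, as in the test below.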
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+  AddSnapshot(1);
+  RunTest(
+      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
+       test::KeyStr("b", 1, kTypeValue)},
+      {"", "", ""},
+      {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
+      {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+      true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+  AddSnapshot(1);
+  RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+           test::KeyStr("b", 2, kTypeSingleDeletion)},
+          {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+          kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+                        testing::Values(true, false));
+
+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+  bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 5;
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+       test::KeyStr("c", 5, kTypeValue)},
+      {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // The first two keys are hot, so they should have
+  // `output_to_penultimate_level() == true` and their sequence numbers are
+  // not zeroed out.
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `c` is cold data, which should be output to the bottommost level.
+  ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+  AddSnapshot(5);
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+       test::KeyStr("b", 5, kTypeValue)},
+      {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // The first key and the tombstone are within the snapshot, so they should
+  // be output to the penultimate level (and their sequence numbers cannot be
+  // zeroed out).
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `b`@5 is not protected by the snapshot, so its sequence number is zeroed
+  // out and it should be output to the bottommost level.
+  ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 6;
+
+  AddSnapshot(5);
+
+  InitIterators({test::KeyStr("a", 7, kTypeValue),
+                 test::KeyStr("unsafe_pb", 6, kTypeValue),
+                 test::KeyStr("c", 5, kTypeValue)},
+                {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+                kMaxSequenceNumber, nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  // The second key is unsafe to output to the penultimate level, but it is
+  // within the snapshot, so for the per_key_placement feature it would have
+  // to be output to the penultimate level, which is a corruption. We should
+  // never see such a case, as data with a sequence number within a snapshot
+  // should always come from a higher compaction input level, which makes it
+  // safe to output to the penultimate level.
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->status().IsCorruption());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+                        PerKeyPlacementCompIteratorTest,
+                        testing::Values(true, false));
+
+// Tests how CompactionIterator works together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+    : public CompactionIteratorTest {
+ public:
+  bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output
+// as-is, while committed versions of these keys should get compacted as
+// usual.
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Value) { + RunTest( + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Deletion) { + RunTest({test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Merge) { + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_SingleDelete) { + RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_BlobIndex) { + RunTest({test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +// Test compaction iterator dedup keys visible to the same snapshot. 
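+// Editorial note (worked example, not upstream code): snapshots partition
+// sequence numbers into stripes, and within one stripe only the newest
+// committed version of a user key needs to survive. In DedupSameSnapshot_Value
+// below, the snapshot at seq 2 can only see seqs <= 1 (per the checker), and
+// last_committed_seq is 3, so for foo@4/3/2/1:
+//   foo@4 is uncommitted                                 -> preserved as-is,
+//   foo@3 is the newest committed version above the snapshot -> kept,
+//   foo@2 is invisible to the snapshot and shadowed by foo@3 -> dropped,
+//   foo@1 is the newest version the snapshot can see         -> kept.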
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) { + AddSnapshot(2, 1); + AddSnapshot(4, 3); + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + DedupSameSnapshot_SingleDeletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeSingleDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +// At bottom level, sequence numbers can be zero out, and deletions can be +// removed, but only when they are visible to earliest snapshot. 
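+// Editorial note (worked example, not upstream code): "visible to the
+// earliest snapshot" is decided by the SnapshotChecker here, not by raw
+// sequence numbers alone. In the first test below the snapshot is at seq 2
+// but its last visible sequence is 1, so a@1 may be zeroed out, while b@2 --
+// despite satisfying 2 <= 2 -- is not provably visible to the snapshot and
+// keeps its sequence number, as does c@3, which is newer than the snapshot.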
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), + test::KeyStr("c", 3, kTypeDeletion)}, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", "", ""}, + {test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// Single delete should not cancel out values that not visible to the +// same set of snapshots +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + SingleDeleteAcrossSnapshotBoundary) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, 2 /*last_committed_seq*/); +} + +// Single delete should be kept in case it is not visible to the +// earliest write conflict snapshot. If a single delete is kept for this reason, +// corresponding value can be trimmed to save space. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. 
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking_BlobIndex) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeBlobIndex)},
+          {"", "fake_blob_index"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, false /*bottommost_level*/,
+          2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a wide-column entity. In addition to the value
+// getting trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeWideColumnEntity)},
+          {"", "fake_entity"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */,
+          nullptr /* compaction_filter */, false /* bottommost_level */,
+          2 /* earliest_write_conflict_snapshot */);
+}
+
+// A compaction filter should keep uncommitted keys as-is, and
+//  * convert the latest value to a deletion, and/or
+//  * if the latest value is a merge, apply the filter to all subsequent
+//    merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+       test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+      {"v2", "v1", "v3", "v4"},
+      {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+       test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+      {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+      nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+      {"", "v1"},
+      {test::KeyStr("a", 2, kTypeDeletion),
+       test::KeyStr("a", 1, kTypeDeletion)},
+      {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+      compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       CompactionFilter_PartialMerge) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+           test::KeyStr("a", 1, kTypeMerge)},
+          {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+          2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+       test::KeyStr("a", 1, kTypeValue)},
+      {"v3", "v2", "v1"},
+      {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+      {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+      compaction_filter.get());
+}
+
+// Tests how CompactionIterator works together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+    : public CompactionIteratorTest {
+ public:
+  bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, the compaction iterator is not targeting
+// the bottommost level since there is no guarantee there won't be further
+// data ingested under the compaction output in the future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+       MergeToPutIfEncounteredPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+                        CompactionIteratorWithAllowIngestBehindTest,
+                        testing::Values(true, false));
+
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+  CompactionIteratorTsGcTest()
+      : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"a3", "", "b2"};
+  std::string full_history_ts_low;
+  // All keys' timestamps are newer than or equal to 102, thus none of them
+  // will be eligible for GC.
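+  // full_history_ts_low holds the GC threshold encoded with the fixed width
+  // the U64-timestamp comparator expects. A minimal sketch using RocksDB's
+  // coding helpers (util/coding.h):
+  //   std::string ts_low;
+  //   PutFixed64(&ts_low, 102);                     // little-endian fixed64
+  //   assert(DecodeFixed64(ts_low.data()) == 102);  // round-trips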
+  PutFixed64(&full_history_ts_low, 102);
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const std::pair<bool, bool>& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(10002, user_key, 102, kTypeMerge),
+      test::KeyStr(10001, user_key, 101, kTypeMerge),
+      test::KeyStr(10000, user_key, 100, kTypeValue)};
+  const std::vector<std::string> input_values = {"2", "1", "a0"};
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendTESTOperator();
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const auto& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+            /*compaction_filter=*/nullptr, bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level,
+            /*full_history_ts_low=*/nullptr);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "a1", "b5"};
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+  {
+    // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+    // be preserved.
+    AddSnapshot(3);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[3]};
+    const std::vector<std::string> expected_values = {"", "a2", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+    ClearSnapshots();
+  }
+  {
+    // No snapshot, the deletion marker should be preserved because the user
+    // key may appear beyond the output level.
+    const std::vector<std::string> expected_keys = {input_keys[0],
+                                                    input_keys[3]};
+    const std::vector<std::string> expected_values = {"", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+  }
+  {
+    // No snapshot, the deletion marker can be dropped because the user key
+    // does not appear in higher levels.
+    const std::vector<std::string> expected_keys = {input_keys[3]};
+    const std::vector<std::string> expected_values = {"b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true,
+            &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) {
+  constexpr char user_key[][2] = {"a", "f"};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+      test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+      test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+      test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+      test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+      test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+      test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+                   kTypeDeletionWithTimestamp)};
+  const std::vector<std::string> input_values = {"25", "19", "18", "16",
+                                                 "19", "17", ""};
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendTESTOperator();
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, 20000);
+
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+
+  {
+    AddSnapshot(1600);
+    AddSnapshot(1900);
+    const std::vector<std::string> expected_keys = {
+        test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+        test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+        test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+        test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+        test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+        test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+        test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+                     kTypeDeletionWithTimestamp)};
+    const std::vector<std::string> expected_values = {"25", "19", "18", "16",
+                                                      "19", "17", ""};
+    for (const auto& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      auto expected_keys_copy = expected_keys;
+      auto expected_values_copy = expected_values;
+      if (bottommost_level || key_not_exists_beyond_output_level) {
+        // The kTypeDeletionWithTimestamp entry will be dropped.
+        expected_keys_copy.pop_back();
+        expected_values_copy.pop_back();
+        if (bottommost_level) {
+          // Sequence number zeroed out.
+          expected_keys_copy[3] =
+              test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue);
+        }
+      }
+      RunTest(input_keys, input_values, expected_keys_copy,
+              expected_values_copy,
+              /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+              /*compaction_filter=*/nullptr, bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+    ClearSnapshots();
+  }
+
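+  // Once the snapshots at seq 1600 and 1900 are released, nothing separates
+  // the merge operands from the base values below them anymore; e.g. for "a"
+  // the TEST string-append operator folds {"16", "18", "19", "25"} into
+  // "16,18,19,25", as verified below.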
+  // No snapshots.
+  {
+    const std::vector<std::string> expected_keys = {
+        test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue),
+        test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)};
+    const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"};
+    for (const auto& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      auto expected_keys_copy = expected_keys;
+      auto expected_values_copy = expected_values;
+      if (bottommost_level) {
+        expected_keys_copy[1] =
+            test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue);
+      }
+      RunTest(input_keys, input_values, expected_keys_copy,
+              expected_values_copy,
+              /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+              /*compaction_filter=*/nullptr, bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "a1", "a0"};
+  {
+    std::string full_history_ts_low;
+    // Keys whose timestamps are larger than or equal to 102 will be preserved.
+    PutFixed64(&full_history_ts_low, 102);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[2]};
+    const std::vector<std::string> expected_values = {"", input_values[1],
+                                                      input_values[2]};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, DropTombstones) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {input_keys[0],
+                                                  input_keys[1]};
+  const std::vector<std::string> expected_values = {"", "a2"};
+
+  // Take a snapshot at seq 2.
+  AddSnapshot(2);
+
+  {
+    // Non-bottommost level, but the key does not exist beyond the output
+    // level.
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true,
+            &full_history_ts_low);
+  }
+  {
+    // Bottommost level.
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/true,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, RewriteTs) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {
+      input_keys[0], input_keys[1], input_keys[2],
+      test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)};
+  const std::vector<std::string> expected_values = {"", "a2", "", "a0"};
+
+  AddSnapshot(1);
+  AddSnapshot(2);
+
+  {
+    // Bottommost level and need to rewrite both ts and seq.
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/true,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true,
+            &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a3", "b2"};
+  std::string full_history_ts_low;
+  // All keys' timestamps are newer than or equal to 102, thus none of them
+  // will be eligible for GC.
+  PutFixed64(&full_history_ts_low, 102);
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const std::pair<bool, bool>& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {input_keys[0],
+                                                  input_keys[1]};
+  const std::vector<std::string> expected_values = {"", "a2"};
+
+  // Take a snapshot at seq 2.
+  AddSnapshot(2);
+  {
+    const std::vector<std::pair<bool, bool>> params = {
+        {false, false}, {false, true}, {true, true}};
+    for (const std::pair<bool, bool>& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      std::string full_history_ts_low;
+      PutFixed64(&full_history_ts_low, 102);
+      RunTest(input_keys, input_values, expected_keys, expected_values,
+              /*last_committed_seq=*/kMaxSequenceNumber,
+              /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+              bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "b5"};
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+  {
+    // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+    // be preserved.
+    AddSnapshot(3);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[2]};
+    const std::vector<std::string> expected_values = {"", "a2", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+    ClearSnapshots();
+  }
+  {
+    // No snapshot.
+    const std::vector<std::string> expected_keys = {input_keys[2]};
+    const std::vector<std::string> expected_values = {"b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false,
+            &full_history_ts_low);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance,
+                        CompactionIteratorTsGcTest,
+                        testing::Values(true, false));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..1da1bcda8
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,2060 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <optional>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_counting_iterator.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/builder.h"
+#include "db/compaction/clipping_iterator.h"
+#include "db/compaction/compaction_state.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/history_trimming_iterator.h"
+#include "db/log_writer.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+  switch (compaction_reason) {
+    case CompactionReason::kUnknown:
+      return "Unknown";
+    case CompactionReason::kLevelL0FilesNum:
+      return "LevelL0FilesNum";
+    case CompactionReason::kLevelMaxLevelSize:
+      return "LevelMaxLevelSize";
+    case CompactionReason::kUniversalSizeAmplification:
+      return "UniversalSizeAmplification";
+    case CompactionReason::kUniversalSizeRatio:
+      return "UniversalSizeRatio";
+    case CompactionReason::kUniversalSortedRunNum:
+      return "UniversalSortedRunNum";
+    case CompactionReason::kFIFOMaxSize:
+      return "FIFOMaxSize";
+    case CompactionReason::kFIFOReduceNumFiles:
+      return "FIFOReduceNumFiles";
+    case CompactionReason::kFIFOTtl:
+      return "FIFOTtl";
+    case CompactionReason::kManualCompaction:
+      return "ManualCompaction";
+    case CompactionReason::kFilesMarkedForCompaction:
+      return "FilesMarkedForCompaction";
+    case CompactionReason::kBottommostFiles:
+      return "BottommostFiles";
+    case CompactionReason::kTtl:
+      return "Ttl";
+    case CompactionReason::kFlush:
+      return "Flush";
+    case CompactionReason::kExternalSstIngestion:
+      return "ExternalSstIngestion";
+    case CompactionReason::kPeriodicCompaction:
+      return "PeriodicCompaction";
+    case CompactionReason::kChangeTemperature:
+      return "ChangeTemperature";
+    case CompactionReason::kForcedBlobGC:
+      return "ForcedBlobGC";
+    case CompactionReason::kRoundRobinTtl:
+      return "RoundRobinTtl";
+    case CompactionReason::kNumOfReasons:
+      // fall through
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+const char* GetCompactionPenultimateOutputRangeTypeString(
+    Compaction::PenultimateOutputRangeType range_type) {
+  switch (range_type) {
+    case Compaction::PenultimateOutputRangeType::kNotSupported:
+      return "NotSupported";
+    case Compaction::PenultimateOutputRangeType::kFullRange:
+      return "FullRange";
+    case Compaction::PenultimateOutputRangeType::kNonLastRange:
+      return "NonLastRange";
+    case Compaction::PenultimateOutputRangeType::kDisabled:
+      return "Disabled";
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+CompactionJob::CompactionJob(
+    int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+    const MutableDBOptions& mutable_db_options,
+    const FileOptions& file_options, VersionSet* versions,
+    const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+    FSDirectory* db_directory, FSDirectory* output_directory,
+    FSDirectory* blob_output_directory, Statistics* stats,
+    InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+    std::vector<SequenceNumber> existing_snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    const SnapshotChecker* snapshot_checker, JobContext* job_context,
+    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+    bool paranoid_file_checks, bool measure_io_stats,
+    const std::string& dbname, CompactionJobStats* compaction_job_stats,
+    Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const std::string& db_id, const std::string& db_session_id,
+    std::string full_history_ts_low, std::string trim_ts,
+    BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+    int* bg_bottom_compaction_scheduled)
+    : compact_(new CompactionState(compaction)),
+      compaction_stats_(compaction->compaction_reason(), 1),
+      db_options_(db_options),
+      mutable_db_options_copy_(mutable_db_options),
+      log_buffer_(log_buffer),
+      output_directory_(output_directory),
+      stats_(stats),
+      bottommost_level_(false),
+      write_hint_(Env::WLTH_NOT_SET),
+      compaction_job_stats_(compaction_job_stats),
+      job_id_(job_id),
+      dbname_(dbname),
+      db_id_(db_id),
+      db_session_id_(db_session_id),
+      file_options_(file_options),
+      env_(db_options.env),
+      io_tracer_(io_tracer),
+      fs_(db_options.fs, io_tracer),
+      file_options_for_read_(
+          fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+      versions_(versions),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      db_directory_(db_directory),
+      blob_output_directory_(blob_output_directory),
+      db_mutex_(db_mutex),
+      db_error_handler_(db_error_handler),
+      existing_snapshots_(std::move(existing_snapshots)),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      snapshot_checker_(snapshot_checker),
+      job_context_(job_context),
+      table_cache_(std::move(table_cache)),
+      event_logger_(event_logger),
+      paranoid_file_checks_(paranoid_file_checks),
+      measure_io_stats_(measure_io_stats),
+      thread_pri_(thread_pri),
+      full_history_ts_low_(std::move(full_history_ts_low)),
+      trim_ts_(std::move(trim_ts)),
+      blob_callback_(blob_callback),
+      extra_num_subcompaction_threads_reserved_(0),
+      bg_compaction_scheduled_(bg_compaction_scheduled),
+      bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+  assert(compaction_job_stats_ != nullptr);
+  assert(log_buffer_ != nullptr);
+
+  const auto* cfd = compact_->compaction->column_family_data();
+  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+                                    db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+  ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+  assert(compact_ == nullptr);
+  ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+  const auto* cfd = compact_->compaction->column_family_data();
+  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+                                    db_options_.enable_thread_tracking);
+
+  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+                                               job_id_);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+      (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+          compact_->compaction->output_level());
+
+  // In the current design, a CompactionJob is always created
+  // for non-trivial compaction.
+  assert(compaction->IsTrivialMove() == false ||
+         compaction->is_manual_compaction() == true);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_PROP_FLAGS,
+      compaction->is_manual_compaction() +
+          (compaction->deletion_compaction() << 1));
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+      compaction->CalculateTotalInputSize());
+
+  IOSTATS_RESET(bytes_written);
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+  // Set the thread operation after operation properties
+  // to ensure GetThreadList() can always show them all together.
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+  compaction_job_stats_->is_manual_compaction =
+      compaction->is_manual_compaction();
+  compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for compaction before making Iterator
+  auto* c = compact_->compaction;
+  ColumnFamilyData* cfd = c->column_family_data();
+  assert(cfd != nullptr);
+  assert(cfd->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  if (c->ShouldFormSubcompactions()) {
+    StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+    GenSubcompactionBoundaries();
+  }
+  if (boundaries_.size() > 1) {
+    for (size_t i = 0; i <= boundaries_.size(); i++) {
+      compact_->sub_compact_states.emplace_back(
+          c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+          (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+                                    : std::nullopt,
+          static_cast<uint32_t>(i));
+      // assert to validate that boundaries don't have same user keys (without
+      // timestamp part).
+      assert(i == 0 || i == boundaries_.size() ||
+             cfd->user_comparator()->CompareWithoutTimestamp(
+                 boundaries_[i - 1], boundaries_[i]) < 0);
+    }
+    RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+                      compact_->sub_compact_states.size());
+  } else {
+    compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+                                              /*sub_job_id*/ 0);
+  }
+
+  // Collect all seqno->time information from the input files, which will be
+  // used to encode seqno->time to the output files.
+  uint64_t preserve_time_duration =
+      std::max(c->immutable_options()->preserve_internal_time_seconds,
+               c->immutable_options()->preclude_last_level_data_seconds);
+
+  if (preserve_time_duration > 0) {
+    // Set up seqno_time_mapping_.
+    seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+    for (const auto& each_level : *c->inputs()) {
+      for (const auto& fmd : each_level.files) {
+        std::shared_ptr<const TableProperties> tp;
+        Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+        if (s.ok()) {
+          seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+              .PermitUncheckedError();
+          seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+                                  fmd->oldest_ancester_time);
+        }
+      }
+    }
+
+    auto status = seqno_time_mapping_.Sort();
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Invalid sequence number to time mapping: Status: %s",
+                     status.ToString().c_str());
+    }
+    int64_t _current_time = 0;
+    status = db_options_.clock->GetCurrentTime(&_current_time);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to get current time in compaction: Status: %s",
+                     status.ToString().c_str());
+      // Preserve all time information.
+      preserve_time_min_seqno_ = 0;
+      preclude_last_level_min_seqno_ = 0;
+    } else {
+      seqno_time_mapping_.TruncateOldEntries(_current_time);
+      uint64_t preserve_time =
+          static_cast<uint64_t>(_current_time) > preserve_time_duration
+              ? _current_time - preserve_time_duration
+              : 0;
+      preserve_time_min_seqno_ =
+          seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+      if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+        uint64_t preclude_last_level_time =
+            static_cast<uint64_t>(_current_time) >
+                    c->immutable_options()->preclude_last_level_data_seconds
+                ? _current_time -
+                      c->immutable_options()->preclude_last_level_data_seconds
+                : 0;
+        preclude_last_level_min_seqno_ =
+            seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+      }
+    }
+  }
+}
+
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+  return extra_num_subcompaction_threads_reserved_ +
+         std::max(
+             std::uint64_t(1),
+             static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+    int num_extra_required_subcompactions) {
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+  int max_db_compactions =
+      DBImpl::GetBGJobLimits(
+          mutable_db_options_copy_.max_background_flushes,
+          mutable_db_options_copy_.max_background_compactions,
+          mutable_db_options_copy_.max_background_jobs,
+          versions_->GetColumnFamilySet()
+              ->write_controller()
+              ->NeedSpeedupCompaction())
+          .max_compactions;
+  InstrumentedMutexLock l(db_mutex_);
+  // Apply the min function first since we need to compute the extra
+  // subcompactions against the compaction limits, and then try to reserve
+  // threads for the extra subcompactions. The actual number of reserved
+  // threads could be less than the desired number.
+  int available_bg_compactions_against_db_limit =
+      std::max(max_db_compactions - *bg_compaction_scheduled_ -
+                   *bg_bottom_compaction_scheduled_,
+               0);
+  // Reservation only supports background threads whose priority is between
+  // BOTTOM and HIGH. Need to degrade the priority to HIGH if the original
+  // thread_pri_ is higher than that. Similar to ReleaseThreads().
+  extra_num_subcompaction_threads_reserved_ =
+      env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+                                    available_bg_compactions_against_db_limit),
+                           std::min(thread_pri_, Env::Priority::HIGH));
+
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on if this compaction has the bottommost priority.
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ +=
+        extra_num_subcompaction_threads_reserved_;
+  } else {
+    *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+  }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(
+    uint64_t num_extra_resources) {
+  // Do nothing when we have zero resources to shrink.
+  if (num_extra_resources == 0) return;
+  db_mutex_->Lock();
+  // We cannot release more threads than we reserved before.
+  int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+      (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+  // Update the number of reserved threads and the number of background
+  // scheduled compactions for this compaction job.
+  extra_num_subcompaction_threads_reserved_ -=
+      extra_num_subcompaction_threads_released;
+  // TODO (zichen): design a test case with new subcompaction partitioning
+  // when the number of actual partitions is less than the number of planned
+  // partitions.
+  assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on if this compaction has the bottommost priority.
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ -=
+        extra_num_subcompaction_threads_released;
+  } else {
+    *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+  }
+  db_mutex_->Unlock();
+  TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
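+
+// Illustrative accounting sketch for the acquire/shrink pairing above,
+// assuming R extra threads were granted to a non-bottom-priority compaction:
+//   extra_num_subcompaction_threads_reserved_ = R;   // on acquire
+//   *bg_compaction_scheduled_ += R;
+//   ...
+//   *bg_compaction_scheduled_ -= R;                  // on shrink/release
+//   extra_num_subcompaction_threads_reserved_ = 0;
+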
+void CompactionJob::ReleaseSubcompactionResources() {
+  if (extra_num_subcompaction_threads_reserved_ == 0) {
+    return;
+  }
+  {
+    InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction priority is round robin and there are not sufficient
+    // sub-compactions available.
+
+    // The number of scheduled compactions must be no less than 1 + the number
+    // of extra subcompactions using acquired resources, since this compaction
+    // job has not finished yet.
+    assert(*bg_bottom_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_ ||
+           *bg_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_);
+  }
+  ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
+
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+  // The goal is to find some boundary keys so that we can evenly partition
+  // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges, together with the
+  // range sizes. This can be calculated by scanning index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys dividing the ranges evenly.
+  // For example, if we have two input files, and each returns the following
+  // ranges:
+  //   File1: (a1, 1000), (b1, 1200), (c1, 1100)
+  //   File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We sort all the keys into the following order:
+  //   (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+  // We calculate the total size by adding up all ranges' sizes, which is 6400.
+  // If we would like to partition into 2 subcompactions, the target range
+  // size is 3200. Based on the sizes, we take "b1" as the partition key since
+  // the first three ranges would hit 3200.
+  //
+  // Note that the ranges are actually overlapping. For example, in the example
+  // above, the range ending with "b1" is overlapping with the range ending
+  // with "b2". So the size 1000+1100+1200 is an underestimation of the data
+  // size up to "b1". In extreme cases where we only compact N L0 files, a
+  // range can overlap with N-1 other ranges. Since we requested a relatively
+  // large number (128) of ranges from each input file, even N-way range
+  // overlap would cause relatively small inaccuracy.
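+  //
+  // A minimal sketch of the boundary-picking loop further below, assuming the
+  // anchors are already sorted and deduplicated:
+  //   uint64_t target = total_size / num_planned_subcompactions;
+  //   uint64_t next_threshold = target, cumulative = 0;
+  //   for (const TableReader::Anchor& a : all_anchors) {
+  //     cumulative += a.range_size;
+  //     if (cumulative > next_threshold) {
+  //       next_threshold += target;
+  //       boundaries_.push_back(a.user_key);
+  //     }
+  //   }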
+
+  auto* c = compact_->compaction;
+  if (c->max_subcompactions() <= 1 &&
+      !(c->immutable_options()->compaction_pri == kRoundRobin &&
+        c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+    return;
+  }
+  auto* cfd = c->column_family_data();
+  const Comparator* cfd_comparator = cfd->user_comparator();
+  const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+  auto* v = compact_->compaction->input_version();
+  int base_level = v->storage_info()->base_level();
+  InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+  uint64_t total_size = 0;
+  std::vector<TableReader::Anchor> all_anchors;
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
+        continue;
+      }
+
+      for (size_t i = 0; i < num_files; i++) {
+        FileMetaData* f = flevel->files[i].file_metadata;
+        std::vector<TableReader::Anchor> my_anchors;
+        Status s = cfd->table_cache()->ApproximateKeyAnchors(
+            ReadOptions(), icomp, *f, my_anchors);
+        if (!s.ok() || my_anchors.empty()) {
+          my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+        }
+        for (auto& ac : my_anchors) {
+          // This could be optimized to avoid the loop.
+          total_size += ac.range_size;
+        }
+
+        all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+                           my_anchors.end());
+      }
+    }
+  }
+  // Here we sort all the anchor points across all files and go through them
+  // in the sorted order to find partitioning boundaries.
+  // This is not the most efficient implementation; a much more efficient
+  // algorithm probably exists, but it would also be more complex. If
+  // performance turns out to be a problem, we can optimize.
+  std::sort(
+      all_anchors.begin(), all_anchors.end(),
+      [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+        return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+               0;
+      });
+
+  // Remove duplicated entries from boundaries.
+  all_anchors.erase(
+      std::unique(all_anchors.begin(), all_anchors.end(),
+                  [cfd_comparator](TableReader::Anchor& a,
+                                   TableReader::Anchor& b) -> bool {
+                    return cfd_comparator->CompareWithoutTimestamp(
+                               a.user_key, b.user_key) == 0;
+                  }),
+      all_anchors.end());
+
+  // Get the number of planned subcompactions; this may reserve extra threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin.
+  uint64_t num_planned_subcompactions;
+  if (c->immutable_options()->compaction_pri == kRoundRobin &&
+      c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction priority, we need to employ more
+    // subcompactions (which may exceed the max_subcompaction limit). The extra
+    // subcompactions will be executed using reserved threads and taken into
+    // account in bg_compaction_scheduled or bg_bottom_compaction_scheduled.
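+    // Illustrative numbers: with 10 input files at the start level and
+    // GetSubcompactionsLimit() == 4, AcquireSubcompactionResources(10 - 4)
+    // requests 6 extra threads; if only 2 are granted, the limit becomes
+    // 4 + 2 = 6 and the plan shrinks to min(10, 6) = 6 subcompactions.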
+
+    // Initialized by the number of input files.
+    num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+    uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+    if (max_subcompactions_limit < num_planned_subcompactions) {
+      // Assert that the two pointers are not empty so that we can use the
+      // extra subcompactions against db compaction limits.
+      assert(bg_bottom_compaction_scheduled_ != nullptr);
+      assert(bg_compaction_scheduled_ != nullptr);
+      // Reserve resources when max_subcompaction is not sufficient.
+      AcquireSubcompactionResources(
+          (int)(num_planned_subcompactions - max_subcompactions_limit));
+      // The subcompactions limit changes after acquiring additional resources.
+      // Need to call GetSubcompactionsLimit() again to update the number
+      // of planned subcompactions.
+      num_planned_subcompactions =
+          std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+    } else {
+      num_planned_subcompactions = max_subcompactions_limit;
+    }
+  } else {
+    num_planned_subcompactions = GetSubcompactionsLimit();
+  }
+
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+                           &num_planned_subcompactions);
+  if (num_planned_subcompactions == 1) return;
+
+  // Group the ranges into subcompactions.
+  uint64_t target_range_size = std::max(
+      total_size / num_planned_subcompactions,
+      MaxFileSizeForLevel(
+          *(c->mutable_cf_options()), out_lvl,
+          c->immutable_options()->compaction_style, base_level,
+          c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+  if (target_range_size >= total_size) {
+    return;
+  }
+
+  uint64_t next_threshold = target_range_size;
+  uint64_t cumulative_size = 0;
+  uint64_t num_actual_subcompactions = 1U;
+  for (TableReader::Anchor& anchor : all_anchors) {
+    cumulative_size += anchor.range_size;
+    if (cumulative_size > next_threshold) {
+      next_threshold += target_range_size;
+      num_actual_subcompactions++;
+      boundaries_.push_back(anchor.user_key);
+    }
+    if (num_actual_subcompactions == num_planned_subcompactions) {
+      break;
+    }
+  }
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+                           &num_actual_subcompactions);
+  // Shrink the extra subcompaction resources when extra resources were
+  // acquired.
+  ShrinkSubcompactionResources(
+      std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+               extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+  TEST_SYNC_POINT("CompactionJob::Run():Start");
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+
+  const size_t num_threads = compact_->sub_compact_states.size();
+  assert(num_threads > 0);
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+
+  // Launch a thread for each of subcompactions 1...num_threads-1.
+  std::vector<port::Thread> thread_pool;
+  thread_pool.reserve(num_threads - 1);
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+                             &compact_->sub_compact_states[i]);
+  }
+
+  // Always schedule the first subcompaction (whether or not there are also
+  // others) in the current thread to be efficient with resources.
+  ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+  // Wait for all other threads (if there are any) to finish execution.
+  for (auto& thread : thread_pool) {
+    thread.join();
+  }
+
+  compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+  for (auto& state : compact_->sub_compact_states) {
+    compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+    state.RemoveLastEmptyOutput();
+  }
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+  // Check if any thread encountered an error during execution.
+  Status status;
+  IOStatus io_s;
+  bool wrote_new_blob_files = false;
+
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      io_s = state.io_status;
+      break;
+    }
+
+    if (state.Current().HasBlobFileAdditions()) {
+      wrote_new_blob_files = true;
+    }
+  }
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+
+    if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+        blob_output_directory_ != output_directory_) {
+      io_s = blob_output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    thread_pool.clear();
+    std::vector<const CompactionOutputs::Output*> files_output;
+    for (const auto& state : compact_->sub_compact_states) {
+      for (const auto& output : state.GetOutputs()) {
+        files_output.emplace_back(&output);
+      }
+    }
+    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+    auto& prefix_extractor =
+        compact_->compaction->mutable_cf_options()->prefix_extractor;
+    std::atomic<size_t> next_file_idx(0);
+    auto verify_table = [&](Status& output_status) {
+      while (true) {
+        size_t file_idx = next_file_idx.fetch_add(1);
+        if (file_idx >= files_output.size()) {
+          break;
+        }
+        // Verify that the table is usable. We set for_compaction to false and
+        // don't OptimizeForCompactionTableRead here because this is a special
+        // case after we finish the table building. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we will regard this
+        // verification as user reads since the goal is to cache it here for
+        // further user reads.
+        ReadOptions read_options;
+        InternalIterator* iter = cfd->table_cache()->NewIterator(
+            read_options, file_options_, cfd->internal_comparator(),
+            files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
+            prefix_extractor,
+            /*table_reader_ptr=*/nullptr,
+            cfd->internal_stats()->GetFileReadHist(
+                compact_->compaction->output_level()),
+            TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+            /*skip_filters=*/false, compact_->compaction->output_level(),
+            MaxFileSizeForL0MetaPin(
+                *compact_->compaction->mutable_cf_options()),
+            /*smallest_compaction_key=*/nullptr,
+            /*largest_compaction_key=*/nullptr,
+            /*allow_unprepared_value=*/false);
+        auto s = iter->status();
+
+        if (s.ok() && paranoid_file_checks_) {
+          OutputValidator validator(cfd->internal_comparator(),
+                                    /*_enable_order_check=*/true,
+                                    /*_enable_hash=*/true);
+          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+            s = validator.Add(iter->key(), iter->value());
+            if (!s.ok()) {
+              break;
+            }
+          }
+          if (s.ok()) {
+            s = iter->status();
+          }
+          if (s.ok() &&
+              !validator.CompareValidator(files_output[file_idx]->validator)) {
+            s = Status::Corruption("Paranoid checksums do not match");
+          }
+        }
+
+        delete iter;
+
+        if (!s.ok()) {
+          output_status = s;
+          break;
+        }
+      }
+    };
+    for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+      thread_pool.emplace_back(
+          verify_table, std::ref(compact_->sub_compact_states[i].status));
+    }
+    verify_table(compact_->sub_compact_states[0].status);
+    for (auto& thread : thread_pool) {
+      thread.join();
+    }
+
+    for (const auto& state : compact_->sub_compact_states) {
+      if (!state.status.ok()) {
+        status = state.status;
+        break;
+      }
+    }
+  }
+
+  ReleaseSubcompactionResources();
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+
+  TablePropertiesCollection tp;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.GetOutputs()) {
+      auto fn =
+          TableFileName(state.compaction->immutable_options()->cf_paths,
+                        output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+      tp[fn] = output.table_properties;
+    }
+  }
+  compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+  // Finish up all bookkeeping to unify the subcompaction results.
+  compact_->AggregateCompactionStats(compaction_stats_,
+                                     *compaction_job_stats_);
+  UpdateCompactionStats();
+
+  RecordCompactionIOStats();
+  LogFlush(db_options_.info_log);
+  TEST_SYNC_POINT("CompactionJob::Run():End");
+
+  compact_->status = status;
+  return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+  assert(compact_);
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_INSTALL);
+  db_mutex_->AssertHeld();
+  Status status = compact_->status;
+
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  assert(cfd);
+
+  int output_level = compact_->compaction->output_level();
+  cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
+                                            compaction_stats_);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(mutable_cf_options);
+  }
+  if (!versions_->io_status().ok()) {
+    io_status_ = versions_->io_status();
+  }
+
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  auto vstorage = cfd->current()->storage_info();
+  const auto& stats = compaction_stats_.stats;
+
+  double read_write_amp = 0.0;
+  double write_amp = 0.0;
+  double bytes_read_per_sec = 0;
+  double bytes_written_per_sec = 0;
+
+  const uint64_t bytes_read_non_output_and_blob =
+      stats.bytes_read_non_output_levels + stats.bytes_read_blob;
+  const uint64_t bytes_read_all =
+      stats.bytes_read_output_level + bytes_read_non_output_and_blob;
+  const uint64_t bytes_written_all =
+      stats.bytes_written + stats.bytes_written_blob;
+
+  if (bytes_read_non_output_and_blob > 0) {
+    read_write_amp = (bytes_written_all + bytes_read_all) /
+                     static_cast<double>(bytes_read_non_output_and_blob);
+    write_amp = bytes_written_all /
+                static_cast<double>(bytes_read_non_output_and_blob);
+  }
+  if (stats.micros > 0) {
+    bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros);
+    bytes_written_per_sec =
+        bytes_written_all / static_cast<double>(stats.micros);
+  }
+
+  const std::string& column_family_name = cfd->GetName();
+
+  constexpr double kMB = 1048576.0;
+
+  ROCKS_LOG_BUFFER(
+      log_buffer_,
+      "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+      "files in(%d, %d) out(%d +%d blob) "
+      "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), "
+      "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64
+      ", records dropped: %" PRIu64 " output_compression: %s\n",
+      column_family_name.c_str(), vstorage->LevelSummary(&tmp),
+      bytes_read_per_sec, bytes_written_per_sec,
+      compact_->compaction->output_level(),
+      stats.num_input_files_in_non_output_levels,
+      stats.num_input_files_in_output_level, stats.num_output_files,
+      stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB,
+      stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB,
+      stats.bytes_written / kMB, stats.bytes_written_blob / kMB,
+      read_write_amp, write_amp, status.ToString().c_str(),
+      stats.num_input_records, stats.num_dropped_records,
+      CompressionTypeToString(compact_->compaction->output_compression())
+          .c_str());
+
+  const auto& blob_files = vstorage->GetBlobFiles();
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    assert(blob_files.back());
+
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+        column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+        blob_files.back()->GetBlobFileNumber());
+  }
+
+  if (compaction_stats_.has_penultimate_level_output) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] has Penultimate Level output: %" PRIu64
+        ", level %d, number of files: %" PRIu64
+        ", number of records: %" PRIu64,
+        column_family_name.c_str(),
+        compaction_stats_.penultimate_level_stats.bytes_written,
+        compact_->compaction->GetPenultimateLevel(),
+        compaction_stats_.penultimate_level_stats.num_output_files,
+        compaction_stats_.penultimate_level_stats.num_output_records);
+  }
+
+  UpdateCompactionJobStats(stats);
+
+  auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+  stream << "job" << job_id_ << "event"
+         << "compaction_finished"
+         << "compaction_time_micros" << stats.micros
+         << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+         << compact_->compaction->output_level() << "num_output_files"
+         << stats.num_output_files << "total_output_size"
+         << stats.bytes_written;
+
+  if (stats.num_output_files_blob > 0) {
+    stream << "num_blob_output_files" << stats.num_output_files_blob
+           << "total_blob_output_size" << stats.bytes_written_blob;
+  }
+
+  stream << "num_input_records" << stats.num_input_records
+         << "num_output_records" << stats.num_output_records
+         << "num_subcompactions" << compact_->sub_compact_states.size()
+         << "output_compression"
+         << CompressionTypeToString(
+                compact_->compaction->output_compression());
+
+  stream << "num_single_delete_mismatches"
+         << compaction_job_stats_->num_single_del_mismatch;
+  stream << "num_single_delete_fallthrough"
+         << compaction_job_stats_->num_single_del_fallthru;
+
+  if (measure_io_stats_) {
+    stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos"
+           << compaction_job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_prepare_write_nanos"
+           << compaction_job_stats_->file_prepare_write_nanos;
+  }
+
+  stream << "lsm_state";
+  stream.StartArray();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+    assert(blob_files.back());
+    stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+  }
+
+  if (compaction_stats_.has_penultimate_level_output) {
+    InternalStats::CompactionStats& pl_stats =
+        compaction_stats_.penultimate_level_stats;
+    stream << "penultimate_level_num_output_files"
+           << pl_stats.num_output_files;
+    stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+    SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+  Compaction* c = compact_->compaction;
+
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_canceled_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  sub_compact->notify_on_subcompaction_completion = true;
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionBegin(info);
+  }
+  info.status.PermitUncheckedError();
+
+#else
+  (void)sub_compact;
+#endif  // ROCKSDB_LITE
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+    SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+
+  if (sub_compact->notify_on_subcompaction_completion == false) {
+    return;
+  }
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionCompleted(info);
+  }
+#else
+  (void)sub_compact;
+#endif  // ROCKSDB_LITE
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+
+#ifndef ROCKSDB_LITE
+  if (db_options_.compaction_service) {
+    CompactionServiceJobStatus comp_status =
+        ProcessKeyValueCompactionWithCompactionService(sub_compact);
+    if (comp_status == CompactionServiceJobStatus::kSuccess ||
+        comp_status == CompactionServiceJobStatus::kFailure) {
+      return;
+    }
+    // fallback to local compaction
+    assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+  }
+#endif  // !ROCKSDB_LITE
+
+  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter, and fail the compaction if
+  // IgnoreSnapshots() = false, because that is not supported anymore.
+  const CompactionFilter* compaction_filter =
+      cfd->ioptions()->compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  if (compaction_filter == nullptr) {
+    compaction_filter_from_factory =
+        sub_compact->compaction->CreateCompactionFilter();
+    compaction_filter = compaction_filter_from_factory.get();
+  }
+  if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+    sub_compact->status = Status::NotSupported(
+        "CompactionFilter::IgnoreSnapshots() = false is not supported "
+        "anymore.");
+    return;
+  }
+
+  NotifyOnSubcompactionBegin(sub_compact);
+
+  auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>(
+      &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_,
+      &trim_ts_);
+
+  // TODO: since we already use C++17, should use
+  // std::optional<const Slice> instead.
+  const std::optional<Slice> start = sub_compact->start;
+  const std::optional<Slice> end = sub_compact->end;
+
+  std::optional<Slice> start_without_ts;
+  std::optional<Slice> end_without_ts;
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+  read_options.fill_cache = false;
+  read_options.rate_limiter_priority = GetRateLimiterPriority();
+  // Compaction iterators shouldn't be confined to a single prefix.
+  // Compactions use Seek() for
+  // (a) concurrent compactions,
+  // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+  read_options.total_order_seek = true;
+
+  // Remove the timestamps from the boundaries, because the boundaries created
+  // in GenSubcompactionBoundaries don't have their timestamps stripped.
+  size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  if (start.has_value()) {
+    read_options.iterate_lower_bound = &start.value();
+    if (ts_sz > 0) {
+      start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
+      read_options.iterate_lower_bound = &start_without_ts.value();
+    }
+  }
+  if (end.has_value()) {
+    read_options.iterate_upper_bound = &end.value();
+    if (ts_sz > 0) {
+      end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
+      read_options.iterate_upper_bound = &end_without_ts.value();
+    }
+  }
+
+  // Although the v2 aggregator is what the level iterator(s) know about,
+  // the AddTombstones calls will be propagated down to the v1 aggregator.
+  std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
+      read_options, sub_compact->compaction, range_del_agg.get(),
+      file_options_for_read_, start, end));
+  InternalIterator* input = raw_input.get();
+
+  IterKey start_ikey;
+  IterKey end_ikey;
+  Slice start_slice;
+  Slice end_slice;
+
+  static constexpr char kMaxTs[] =
+      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+  Slice ts_slice;
+  std::string max_ts;
+  if (ts_sz > 0) {
+    if (ts_sz <= strlen(kMaxTs)) {
+      ts_slice = Slice(kMaxTs, ts_sz);
+    } else {
+      max_ts = std::string(ts_sz, '\xff');
+      ts_slice = Slice(max_ts);
+    }
+  }
+
+  if (start.has_value()) {
+    start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
+                              kValueTypeForSeek);
+    if (ts_sz > 0) {
+      start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+                                   &ts_slice);
+    }
+    start_slice = start_ikey.GetInternalKey();
+  }
+  if (end.has_value()) {
+    end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+    if (ts_sz > 0) {
+      end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+                                 &ts_slice);
+    }
+    end_slice = end_ikey.GetInternalKey();
+  }
+
+  std::unique_ptr<InternalIterator> clip;
+  if (start.has_value() || end.has_value()) {
+    clip = std::make_unique<ClippingIterator>(
+        raw_input.get(), start.has_value() ? &start_slice : nullptr,
+        end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
+    input = clip.get();
+  }
+
+  std::unique_ptr<InternalIterator> blob_counter;
+
+  if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
+    BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+    blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
+    input = blob_counter.get();
+  }
+
+  std::unique_ptr<InternalIterator> trim_history_iter;
+  if (ts_sz > 0 && !trim_ts_.empty()) {
+    trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+        input, cfd->user_comparator(), trim_ts_);
+    input = trim_history_iter.get();
+  }
+
+  input->SeekToFirst();
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+  // I/O measurement variables
+  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+  const uint64_t kRecordStatsEvery = 1000;
+  uint64_t prev_write_nanos = 0;
+  uint64_t prev_fsync_nanos = 0;
+  uint64_t prev_range_sync_nanos = 0;
+  uint64_t prev_prepare_write_nanos = 0;
+  uint64_t prev_cpu_write_nanos = 0;
+  uint64_t prev_cpu_read_nanos = 0;
+  if (measure_io_stats_) {
+    prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+    prev_write_nanos = IOSTATS(write_nanos);
+    prev_fsync_nanos = IOSTATS(fsync_nanos);
+    prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+    prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+    prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+    prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+  }
+
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
+      compaction_filter, db_options_.info_log.get(),
+      false /* internal key corruption is expected */,
+      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+      snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+
+  const MutableCFOptions* mutable_cf_options =
+      sub_compact->compaction->mutable_cf_options();
+  assert(mutable_cf_options);
+
+  std::vector<std::string> blob_file_paths;
+
+  // TODO: BlobDB to support output_to_penultimate_level compaction, which
+  // needs 2 builders, so it may need to move into `CompactionOutputs`.
+  std::unique_ptr<BlobFileBuilder> blob_file_builder(
+      (mutable_cf_options->enable_blob_files &&
+       sub_compact->compaction->output_level() >=
+           mutable_cf_options->blob_file_starting_level)
+          ? new BlobFileBuilder(
+                versions_, fs_.get(),
+                sub_compact->compaction->immutable_options(),
+                mutable_cf_options, &file_options_, db_id_, db_session_id_,
+                job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
+                write_hint_, io_tracer_, blob_callback_,
+                BlobFileCreationReason::kCompaction, &blob_file_paths,
+                sub_compact->Current().GetBlobFileAdditionsPtr())
+          : nullptr);
+
+  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      reinterpret_cast<void*>(
+          const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+
+  const std::string* const full_history_ts_low =
+      full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
+  const SequenceNumber job_snapshot_seq =
+      job_context_ ? job_context_->GetJobSnapshotSequence()
+                   : kMaxSequenceNumber;
+
+  auto c_iter = std::make_unique<CompactionIterator>(
+      input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+      &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+      snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+      /*expect_valid_internal_key=*/true, range_del_agg.get(),
+      blob_file_builder.get(), db_options_.allow_data_in_errors,
+      db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+      sub_compact->compaction, compaction_filter, shutting_down_,
+      db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+      preclude_last_level_min_seqno_);
+  c_iter->SeekToFirst();
+
+  // Assign the range delete aggregator to the target output level; this makes
+  // sure it only outputs to a single level.
+  sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+  const auto& c_iter_stats = c_iter->iter_stats();
+
+  // Define the open and close functions for the compaction output files; they
+  // are used to open/close output files when needed.
+  const CompactionFileOpenFunc open_file_func =
+      [this, sub_compact](CompactionOutputs& outputs) {
+        return this->OpenCompactionOutputFile(sub_compact, outputs);
+      };
+  const CompactionFileCloseFunc close_file_func =
+      [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+                          const Slice& next_table_min_key) {
+        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+                                                next_table_min_key);
+      };
+
+  Status status;
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing",
+      reinterpret_cast<void*>(
+          const_cast<Compaction*>(sub_compact->compaction)));
+  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+    // returns true.
+
+    assert(!end.has_value() || cfd->user_comparator()->Compare(
+                                   c_iter->user_key(), end.value()) < 0);
+
+    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+        kRecordStatsEvery - 1) {
+      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+      c_iter->ResetRecordCounts();
+      RecordCompactionIOStats();
+    }
+
+    // Add the current compaction_iterator key to the target compaction
+    // output; if the output file needs to be closed or opened, this will call
+    // `open_file_func` or `close_file_func`.
+    // TODO: it would be better to have the compaction file open/close moved
+    // into `CompactionOutputs`, which has the output file information.
+    status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
+    if (!status.ok()) {
+      break;
+    }
+
+    TEST_SYNC_POINT_CALLBACK(
+        "CompactionJob::Run():PausingManualCompaction:2",
+        reinterpret_cast<void*>(
+            const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+    c_iter->Next();
+    if (c_iter->status().IsManualCompactionPaused()) {
+      break;
+    }
+  }
+
+  sub_compact->compaction_job_stats.num_blobs_read =
+      c_iter_stats.num_blobs_read;
+  sub_compact->compaction_job_stats.total_blob_bytes_read =
+      c_iter_stats.total_blob_bytes_read;
+  sub_compact->compaction_job_stats.num_input_deletion_records =
+      c_iter_stats.num_input_deletion_records;
+  sub_compact->compaction_job_stats.num_corrupt_keys =
+      c_iter_stats.num_input_corrupt_records;
+  sub_compact->compaction_job_stats.num_single_del_fallthru =
+      c_iter_stats.num_single_del_fallthru;
+  sub_compact->compaction_job_stats.num_single_del_mismatch =
+      c_iter_stats.num_single_del_mismatch;
+  sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+      c_iter_stats.total_input_raw_key_bytes;
+  sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+      c_iter_stats.total_input_raw_value_bytes;
+
+  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+             c_iter_stats.total_filter_time);
+
+  if (c_iter_stats.num_blobs_relocated > 0) {
+    RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
+               c_iter_stats.num_blobs_relocated);
+  }
+  if (c_iter_stats.total_blob_bytes_relocated > 0) {
+    RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED,
+               c_iter_stats.total_blob_bytes_relocated);
+  }
+
+  RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+  RecordCompactionIOStats();
+
+  if (status.ok() && cfd->IsDropped()) {
+    status =
+        Status::ColumnFamilyDropped("Column family dropped during compaction");
+  }
+  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+      shutting_down_->load(std::memory_order_relaxed)) {
+    status = Status::ShutdownInProgress("Database shutdown");
+  }
+  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+      (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
+    status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  if (status.ok()) {
+    status = c_iter->status();
+  }
+
+  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+  // close the output files. The open-file function is also passed in because,
+  // if there are only range deletions and no file was opened yet, a new
+  // output file still needs to be created to hold the range deletions.
+  status = sub_compact->CloseCompactionFiles(status, open_file_func,
+                                             close_file_func);
+
+  if (blob_file_builder) {
+    if (status.ok()) {
+      status = blob_file_builder->Finish();
+    } else {
+      blob_file_builder->Abandon(status);
+    }
+    blob_file_builder.reset();
+    sub_compact->Current().UpdateBlobStats();
+  }
+
+  sub_compact->compaction_job_stats.cpu_micros =
+      db_options_.clock->CPUMicros() - prev_cpu_micros;
+
+  if (measure_io_stats_) {
+    sub_compact->compaction_job_stats.file_write_nanos +=
+        IOSTATS(write_nanos) - prev_write_nanos;
+    sub_compact->compaction_job_stats.file_fsync_nanos +=
+        IOSTATS(fsync_nanos) - prev_fsync_nanos;
+    sub_compact->compaction_job_stats.file_range_sync_nanos +=
+        IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+        IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+    sub_compact->compaction_job_stats.cpu_micros -=
+        (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+         IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+        1000;
+    if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+      SetPerfLevel(prev_perf_level);
+    }
+  }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  if (!status.ok()) {
+    if (c_iter) {
+      c_iter->status().PermitUncheckedError();
+    }
+    if (input) {
+      input->status().PermitUncheckedError();
+    }
+  }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+
+  blob_counter.reset();
+  clip.reset();
+  raw_input.reset();
+  sub_compact->status = status;
+  NotifyOnSubcompactionCompleted(sub_compact);
+}
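+
+// Reading guide (annotation, not upstream code): ProcessKeyValueCompaction()
+// above layers a stack of iterator wrappers over the merged input before the
+// main loop runs:
+//
+//   raw_input (merged input files)
+//     -> ClippingIterator        (restricts keys to [start, end), if bounded)
+//     -> BlobCountingIterator    (tracks blob garbage, if blobs referenced)
+//     -> HistoryTrimmingIterator (drops versions newer than trim_ts_, if set)
+//     -> CompactionIterator      (applies snapshots, merges, filters)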
+
+uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
+  return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id;
+}
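+
+// Illustration (hypothetical helpers, not upstream code): GetCompactionId()
+// above packs the 32-bit job id into the high half of a uint64_t and the
+// subcompaction id into the low half, so the pair can be recovered with:
+//
+//   uint32_t job_id = static_cast<uint32_t>(compaction_id >> 32);
+//   uint32_t sub_job_id = static_cast<uint32_t>(compaction_id & 0xffffffffu);
+//
+// e.g. job 5, subcompaction 2 becomes 0x0000000500000002.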
+
+void CompactionJob::RecordDroppedKeys(
+    const CompactionIterationStats& c_iter_stats,
+    CompactionJobStats* compaction_job_stats) {
+  if (c_iter_stats.num_record_drop_user > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+               c_iter_stats.num_record_drop_user);
+  }
+  if (c_iter_stats.num_record_drop_hidden > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+               c_iter_stats.num_record_drop_hidden);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_records_replaced +=
+          c_iter_stats.num_record_drop_hidden;
+    }
+  }
+  if (c_iter_stats.num_record_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+               c_iter_stats.num_record_drop_obsolete);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_expired_deletion_records +=
+          c_iter_stats.num_record_drop_obsolete;
+    }
+  }
+  if (c_iter_stats.num_record_drop_range_del > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+               c_iter_stats.num_record_drop_range_del);
+  }
+  if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+               c_iter_stats.num_range_del_drop_obsolete);
+  }
+  if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+               c_iter_stats.num_optimized_del_drop_obsolete);
+  }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+    const Status& input_status, SubcompactionState* sub_compact,
+    CompactionOutputs& outputs, const Slice& next_table_min_key) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+  assert(sub_compact != nullptr);
+  assert(outputs.HasBuilder());
+
+  FileMetaData* meta = outputs.GetMetaData();
+  uint64_t output_number = meta->fd.GetNumber();
+  assert(output_number != 0);
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  std::string file_checksum = kUnknownFileChecksum;
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+  // Check for iterator errors
+  Status s = input_status;
+
+  // Add range tombstones
+  auto earliest_snapshot = kMaxSequenceNumber;
+  if (existing_snapshots_.size() > 0) {
+    earliest_snapshot = existing_snapshots_[0];
+  }
+  if (s.ok()) {
+    CompactionIterationStats range_del_out_stats;
+    // If the compaction supports per_key_placement, only output range dels to
+    // the penultimate level.
+    // Note: Use `bottommost_level_ = true` for both bottommost and
+    // output_to_penultimate_level compaction here, as it's only used to decide
+    // if range dels could be dropped.
+    if (outputs.HasRangeDel()) {
+      s = outputs.AddRangeDels(
+          sub_compact->start.has_value() ? &(sub_compact->start.value())
+                                         : nullptr,
+          sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
+          range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
+          earliest_snapshot, next_table_min_key, full_history_ts_low_);
+    }
+    RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+    TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+  }
+
+  const uint64_t current_entries = outputs.NumEntries();
+
+  s = outputs.Finish(s, seqno_time_mapping_);
+
+  if (s.ok()) {
+    // With accurate smallest and largest keys, we can get a slightly more
+    // accurate oldest ancester time.
+    // This makes the oldest ancester time in the manifest more accurate than
+    // in the table properties. Not sure how to resolve it.
+    if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+      uint64_t refined_oldest_ancester_time;
+      Slice new_smallest = meta->smallest.user_key();
+      Slice new_largest = meta->largest.user_key();
+      if (!new_largest.empty() && !new_smallest.empty()) {
+        refined_oldest_ancester_time =
+            sub_compact->compaction->MinInputFileOldestAncesterTime(
+                &(meta->smallest), &(meta->largest));
+        if (refined_oldest_ancester_time !=
+            std::numeric_limits<uint64_t>::max()) {
+          meta->oldest_ancester_time = refined_oldest_ancester_time;
+        }
+      }
+    }
+  }
+
+  // Finish and check for file errors
+  IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+                                          db_options_.use_fsync);
+
+  if (s.ok() && io_s.ok()) {
+    file_checksum = meta->file_checksum;
+    file_checksum_func_name = meta->file_checksum_func_name;
+  }
+
+  if (s.ok()) {
+    s = io_s;
+  }
+  if (sub_compact->io_status.ok()) {
+    sub_compact->io_status = io_s;
+    // Since this error is really a copy of the
+    // "normal" status, it does not also need to be checked
+    sub_compact->io_status.PermitUncheckedError();
+  }
+
+  TableProperties tp;
+  if (s.ok()) {
+    tp = outputs.GetTableProperties();
+  }
+
+  if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottom level and the
+    // subcompaction outputs nothing.
+    std::string fname =
+        TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+                      meta->fd.GetNumber(), meta->fd.GetPathId());
+
+    // TODO(AR) it is not clear if there are any larger implications if
+    // DeleteFile fails here
+    Status ds = env_->DeleteFile(fname);
+    if (!ds.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+          " at bottom level%s",
+          cfd->GetName().c_str(), job_id_, output_number,
+          meta->marked_for_compaction ? " (need compaction)" : "");
+    }
+
+    // Also need to remove the file from outputs, or it will be added to the
+    // VersionEdit.
+    outputs.RemoveLastOutput();
+    meta = nullptr;
+  }
+
+  if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+    // Output to event logger and fire events.
+    outputs.UpdateTableProperties();
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+                   " keys, %" PRIu64 " bytes%s, temperature: %s",
+                   cfd->GetName().c_str(), job_id_, output_number,
+                   current_entries, meta->fd.file_size,
+                   meta->marked_for_compaction ? " (need compaction)" : "",
+                   temperature_to_string[meta->temperature].c_str());
+  }
+  std::string fname;
+  FileDescriptor output_fd;
+  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+  Status status_for_listener = s;
+  if (meta != nullptr) {
+    fname = GetTableFileName(meta->fd.GetNumber());
+    output_fd = meta->fd;
+    oldest_blob_file_number = meta->oldest_blob_file_number;
+  } else {
+    fname = "(nil)";
+    if (s.ok()) {
+      status_for_listener = Status::Aborted("Empty SST file not kept");
+    }
+  }
+  EventHelpers::LogAndNotifyTableFileCreationFinished(
+      event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+      job_id_, output_fd, oldest_blob_file_number, tp,
+      TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
+      file_checksum_func_name);
+
+#ifndef ROCKSDB_LITE
+  // Report new file to SstFileManagerImpl
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+  if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+    Status add_s = sfm->OnAddFile(fname);
+    if (!add_s.ok() && s.ok()) {
+      s = add_s;
+    }
+    if (sfm->IsMaxAllowedSpaceReached()) {
+      // TODO(ajkr): should we return OK() if max space was reached by the
+      // final compaction output file (similarly to how flush works when full)?
+      s = Status::SpaceLimit("Max allowed space was reached");
+      TEST_SYNC_POINT(
+          "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached");
+      InstrumentedMutexLock l(db_mutex_);
+      db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+    }
+  }
+#endif
+
+  outputs.ResetBuilder();
+  return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+    const MutableCFOptions& mutable_cf_options) {
+  assert(compact_);
+
+  db_mutex_->AssertHeld();
+
+  auto* compaction = compact_->compaction;
+  assert(compaction);
+
+  {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    if (compaction_stats_.has_penultimate_level_output) {
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+          " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
+          compaction->column_family_data()->GetName().c_str(), job_id_,
+          compaction->InputLevelSummary(&inputs_summary),
+          compaction_stats_.penultimate_level_stats.bytes_written,
+          compaction_stats_.stats.bytes_written,
+          compaction_stats_.TotalBytesWritten());
+    } else {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+                       compaction->column_family_data()->GetName().c_str(),
+                       job_id_, compaction->InputLevelSummary(&inputs_summary),
+                       compaction_stats_.TotalBytesWritten());
+    }
+  }
+
+  VersionEdit* const edit = compaction->edit();
+  assert(edit);
+
+  // Add compaction inputs
+  compaction->AddInputDeletions(edit);
+
+  std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage;
+
+  for (const auto& sub_compact : compact_->sub_compact_states) {
+    sub_compact.AddOutputsEdit(edit);
+
+    for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) {
+      edit->AddBlobFile(blob);
+    }
+
+    if (sub_compact.Current().GetBlobGarbageMeter()) {
+      const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows();
+
+      for (const auto& pair : flows) {
+        const uint64_t blob_file_number = pair.first;
+        const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+
+        assert(flow.IsValid());
+        if (flow.HasGarbage()) {
+          blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(),
+                                                   flow.GetGarbageBytes());
+        }
+      }
+    }
+  }
+
+  for (const auto& pair : blob_total_garbage) {
+    const uint64_t blob_file_number = pair.first;
+    const BlobGarbageMeter::BlobStats& stats = pair.second;
+
+    edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(),
+                             stats.GetBytes());
+  }
+
+  if ((compaction->compaction_reason() ==
+           CompactionReason::kLevelMaxLevelSize ||
+       compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
+      compaction->immutable_options()->compaction_pri == kRoundRobin) {
+    int start_level = compaction->start_level();
+    if (start_level > 0) {
+      auto vstorage = compaction->input_version()->storage_info();
+      edit->AddCompactCursor(start_level,
+                             vstorage->GetNextCompactCursor(
+                                 start_level, compaction->num_input_files(0)));
+    }
+  }
+
+  return versions_->LogAndApply(compaction->column_family_data(),
+                                mutable_cf_options, edit, db_mutex_,
+                                db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+  RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+  RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+  CompactionReason compaction_reason =
+      compact_->compaction->compaction_reason();
+  if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) {
+    RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written));
+  } else if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+    RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written));
+  } else if (compaction_reason == CompactionReason::kTtl) {
+    RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written));
+  }
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
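+
+// Annotation (not upstream code): besides the generic COMPACT_READ_BYTES and
+// COMPACT_WRITE_BYTES tickers, RecordCompactionIOStats() above also counts
+// the same bytes into a reason-specific ticker pair:
+//
+//   kFilesMarkedForCompaction -> COMPACT_{READ,WRITE}_BYTES_MARKED
+//   kPeriodicCompaction       -> COMPACT_{READ,WRITE}_BYTES_PERIODIC
+//   kTtl                      -> COMPACT_{READ,WRITE}_BYTES_TTL
+//
+// The IOSTATS_RESET calls turn the thread-local counters into per-call deltas.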
+
+Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
+                                               CompactionOutputs& outputs) {
+  assert(sub_compact != nullptr);
+
+  // no need to lock because VersionSet::next_file_number_ is atomic
+  uint64_t file_number = versions_->NewFileNumber();
+  std::string fname = GetTableFileName(file_number);
+  // Fire events.
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+  EventHelpers::NotifyTableFileCreationStarted(
+      cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+      TableFileCreationReason::kCompaction);
+#endif  // !ROCKSDB_LITE
+  // Make the output file
+  std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+  bool syncpoint_arg = file_options_.use_direct_writes;
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+                           &syncpoint_arg);
+#endif
+
+  // Pass the temperature of the last level files to the FileSystem.
+  FileOptions fo_copy = file_options_;
+  Temperature temperature = sub_compact->compaction->output_temperature();
+  // Only set for last level compaction, and only when the output does not go
+  // to the penultimate level (when the preclude_last_level feature is enabled)
+  if (temperature == Temperature::kUnknown &&
+      sub_compact->compaction->is_last_level() &&
+      !sub_compact->IsCurrentPenultimateLevel()) {
+    temperature =
+        sub_compact->compaction->mutable_cf_options()->last_level_temperature;
+  }
+  fo_copy.temperature = temperature;
+
+  Status s;
+  IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
+  s = io_s;
+  if (sub_compact->io_status.ok()) {
+    sub_compact->io_status = io_s;
+    // Since this error is really a copy of the io_s that is checked below as
+    // s, it does not also need to be checked.
+    sub_compact->io_status.PermitUncheckedError();
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+        " fails at NewWritableFile with status %s",
+        sub_compact->compaction->column_family_data()->GetName().c_str(),
+        job_id_, file_number, s.ToString().c_str());
+    LogFlush(db_options_.info_log);
+    EventHelpers::LogAndNotifyTableFileCreationFinished(
+        event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+        fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+        TableProperties(), TableFileCreationReason::kCompaction, s,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+    return s;
+  }
+
+  // Try to figure out the output file's oldest ancester time.
+  int64_t temp_current_time = 0;
+  auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time);
+  // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+  if (!get_time_status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get current time. Status: %s",
+                   get_time_status.ToString().c_str());
+  }
+  uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+  InternalKey tmp_start, tmp_end;
+  if (sub_compact->start.has_value()) {
+    tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+  }
+  if (sub_compact->end.has_value()) {
+    tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+  }
+  uint64_t oldest_ancester_time =
+      sub_compact->compaction->MinInputFileOldestAncesterTime(
+          sub_compact->start.has_value() ? &tmp_start : nullptr,
+          sub_compact->end.has_value() ? &tmp_end : nullptr);
+  if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
+    oldest_ancester_time = current_time;
+  }
+
+  // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+  {
+    FileMetaData meta;
+    meta.fd = FileDescriptor(file_number,
+                             sub_compact->compaction->output_path_id(), 0);
+    meta.oldest_ancester_time = oldest_ancester_time;
+    meta.file_creation_time = current_time;
+    meta.temperature = temperature;
+    assert(!db_id_.empty());
+    assert(!db_session_id_.empty());
+    s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
+                               &meta.unique_id);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "[%s] [JOB %d] file #%" PRIu64
+                      " failed to generate unique id: %s.",
+                      cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
+                      s.ToString().c_str());
+      return s;
+    }
+
+    outputs.AddOutput(std::move(meta), cfd->internal_comparator(),
+                      sub_compact->compaction->mutable_cf_options()
+                          ->check_flush_compaction_key_order,
+                      paranoid_file_checks_);
+  }
+
+  writable_file->SetIOPriority(GetRateLimiterPriority());
+  writable_file->SetWriteLifeTimeHint(write_hint_);
+  FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
+  writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+      sub_compact->compaction->OutputFilePreallocationSize()));
+  const auto& listeners =
+      sub_compact->compaction->immutable_options()->listeners;
+  outputs.AssignFileWriter(new WritableFileWriter(
+      std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_,
+      db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kTableFile), false));
+
+  TableBuilderOptions tboptions(
+      *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+      cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+      sub_compact->compaction->output_compression(),
+      sub_compact->compaction->output_compression_opts(), cfd->GetID(),
+      cfd->GetName(), sub_compact->compaction->output_level(),
+      bottommost_level_, TableFileCreationReason::kCompaction,
+      0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
+      sub_compact->compaction->max_output_file_size(), file_number);
+
+  outputs.NewBuilder(tboptions);
+
+  LogFlush(db_options_.info_log);
+  return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+  for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+    sub_compact.Cleanup(table_cache_.get());
+  }
+  delete compact_;
+  compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+}  // namespace
+
+#endif  // !ROCKSDB_LITE
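+
+// Example (annotation with hypothetical values, not upstream code):
+// CopyPrefix() above records truncated output-key prefixes into
+// CompactionJobStats, e.g.:
+//
+//   std::string dst;
+//   CopyPrefix(Slice("user-key-123456"), /*prefix_length=*/8, &dst);
+//   // dst == "user-key"; sources shorter than the prefix are copied whole.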
+
+void CompactionJob::UpdateCompactionStats() {
+  assert(compact_);
+
+  Compaction* compaction = compact_->compaction;
+  compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
+  compaction_stats_.stats.num_input_files_in_output_level = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    if (compaction->level(input_level) != compaction->output_level()) {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.stats.num_input_files_in_non_output_levels,
+          &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+    } else {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.stats.num_input_files_in_output_level,
+          &compaction_stats_.stats.bytes_read_output_level, input_level);
+    }
+  }
+
+  assert(compaction_job_stats_);
+  compaction_stats_.stats.bytes_read_blob =
+      compaction_job_stats_->total_blob_bytes_read;
+
+  compaction_stats_.stats.num_dropped_records =
+      compaction_stats_.DroppedRecords();
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+                                                     uint64_t* bytes_read,
+                                                     int input_level) {
+  const Compaction* compaction = compact_->compaction;
+  auto num_input_files = compaction->num_input_files(input_level);
+  *num_files += static_cast<int>(num_input_files);
+
+  for (size_t i = 0; i < num_input_files; ++i) {
+    const auto* file_meta = compaction->input(input_level, i);
+    *bytes_read += file_meta->fd.GetFileSize();
+    compaction_stats_.stats.num_input_records +=
+        static_cast<uint64_t>(file_meta->num_entries);
+  }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+    const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+  compaction_job_stats_->elapsed_micros = stats.micros;
+
+  // input information
+  compaction_job_stats_->total_input_bytes =
+      stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+  compaction_job_stats_->num_input_records = stats.num_input_records;
+  compaction_job_stats_->num_input_files =
+      stats.num_input_files_in_non_output_levels +
+      stats.num_input_files_in_output_level;
+  compaction_job_stats_->num_input_files_at_output_level =
+      stats.num_input_files_in_output_level;
+
+  // output information
+  compaction_job_stats_->total_output_bytes = stats.bytes_written;
+  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+  compaction_job_stats_->num_output_records = stats.num_output_records;
+  compaction_job_stats_->num_output_files = stats.num_output_files;
+  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+
+  if (stats.num_output_files > 0) {
+    CopyPrefix(compact_->SmallestUserKey(),
+               CompactionJobStats::kMaxPrefixLength,
+               &compaction_job_stats_->smallest_output_key_prefix);
+    CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
+               &compaction_job_stats_->largest_output_key_prefix);
+  }
+#else
+  (void)stats;
+#endif  // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+  Compaction* compaction = compact_->compaction;
+  ColumnFamilyData* cfd = compaction->column_family_data();
+
+  // Let's check if anything will get logged. Don't prepare all the info if
+  // we're not logging.
+  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    ROCKS_LOG_INFO(
+        db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+        cfd->GetName().c_str(), job_id_,
+        compaction->InputLevelSummary(&inputs_summary), compaction->score());
+    char scratch[2345];
+    compaction->Summary(scratch, sizeof(scratch));
+    ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
+                   cfd->GetName().c_str(), scratch);
+    // build event logger report
+    auto stream = event_logger_->Log();
+    stream << "job" << job_id_ << "event"
+           << "compaction_started"
+           << "compaction_reason"
+           << GetCompactionReasonString(compaction->compaction_reason());
+    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+      stream << ("files_L" + std::to_string(compaction->level(i)));
+      stream.StartArray();
+      for (auto f : *compaction->inputs(i)) {
+        stream << f->fd.GetNumber();
+      }
+      stream.EndArray();
+    }
+    stream << "score" << compaction->score() << "input_data_size"
+           << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
+           << (existing_snapshots_.empty()
+                   ? int64_t{-1}  // Use -1 for "none"
+                   : static_cast<int64_t>(existing_snapshots_[0]));
+    if (compaction->SupportsPerKeyPlacement()) {
+      stream << "preclude_last_level_min_seqno"
+             << preclude_last_level_min_seqno_;
+      stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
+      stream << "penultimate_output_range"
+             << GetCompactionPenultimateOutputRangeTypeString(
+                    compaction->GetPenultimateOutputRangeType());
+
+      if (compaction->GetPenultimateOutputRangeType() ==
+          Compaction::PenultimateOutputRangeType::kDisabled) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "[%s] [JOB %d] Penultimate level output is disabled, likely "
+            "because of the range conflict in the penultimate level",
+            cfd->GetName().c_str(), job_id_);
+      }
+    }
+  }
+}
+
+std::string CompactionJob::GetTableFileName(uint64_t file_number) {
+  return TableFileName(compact_->compaction->immutable_options()->cf_paths,
+                       file_number, compact_->compaction->output_path_id());
+}
+
+Env::IOPriority CompactionJob::GetRateLimiterPriority() {
+  if (versions_ && versions_->GetColumnFamilySet() &&
+      versions_->GetColumnFamilySet()->write_controller()) {
+    WriteController* write_controller =
+        versions_->GetColumnFamilySet()->write_controller();
+    if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
+      return Env::IO_USER;
+    }
+  }
+
+  return Env::IO_LOW;
+}
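+
+// Rationale (annotation, not upstream code): when the write controller
+// reports a delay or stop, user writes are already blocked waiting on
+// compaction progress, so compaction I/O is promoted to Env::IO_USER (the
+// highest rate-limiter priority) to clear the stall sooner; otherwise it
+// stays at the default Env::IO_LOW.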
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..bfbce1011
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,500 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class CompactionState;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class SystemClock;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class SubcompactionState;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
+//
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+//    CompactionJobStats is a public data structure which is part of the
+//    compaction event listener, through which RocksDB shares the job stats
+//    with the user.
+//    Internally it's an aggregation of all the compaction_job_stats from each
+//    `SubcompactionState`:
+//
+//                                      +------------------------+
+//                                      | SubcompactionState     |
+//                                      |                        |
+//                               +----->| compaction_job_stats   |
+//                               |      |                        |
+//                               |      +------------------------+
+// +------------------------+    |
+// | CompactionJob          |    |      +------------------------+
+// |                        |    |      | SubcompactionState     |
+// | compaction_job_stats +------+      |                        |
+// |                        |    +----->| compaction_job_stats   |
+// |                        |    |      |                        |
+// +------------------------+    |      +------------------------+
+//                               |
+//                               |      +------------------------+
+//                               |      | SubcompactionState     |
+//                               |      |                        |
+//                               +----->| compaction_job_stats   |
+//                               |      |                        |
+//                               |      +------------------------+
+//                               |
+//                               |      +------------------------+
+//                               |      |          ...           |
+//                               +----->|                        |
+//                                      +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+//    `CompactionStatsFull` is an internal stats structure about the
+//    compaction, which is eventually sent to
+//    `ColumnFamilyData::internal_stats_` and used for logging and public
+//    metrics.
+//    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+//    It has 2 parts: normal stats about the main compaction information and
+//    the penultimate level output stats.
+//    `SubcompactionState` maintains the CompactionOutputs for the normal
+//    output and the penultimate level output if it exists; the per-level
+//    stats are stored with the outputs.
+//
+//  +--------------------------------+      +-----------------------------+
+//  | CompactionJob                  |      | SubcompactionState          |
+//  |                                |      |                             |
+//  |  compaction_stats_             |      |  +----------------------+   |
+//  |  +-------------------------+   |  +----->| CompactionOutputs    |   |
+//  |  | stats (normal)          +------+   |  | (normal output)      |   |
+//  |  +-------------------------+   |      |  | stats_               |   |
+//  |                                |      |  +----------------------+   |
+//  |  +-------------------------+   |      |                             |
+//  |  | penultimate_level_stats +------+   |  +----------------------+   |
+//  |  +-------------------------+   |  +----->| CompactionOutputs    |   |
+//  |                                |      |  | (penultimate_level)  |   |
+//  +--------------------------------+      |  | stats_               |   |
+//                                          |  +----------------------+   |
+//                                          +-----------------------------+
+//                                          (aggregated over one such pair
+//                                           of outputs per SubcompactionState)
+
+class CompactionJob {
+ public:
+  CompactionJob(
+      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const FileOptions& file_options, VersionSet* versions,
+      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+      FSDirectory* db_directory, FSDirectory* output_directory,
+      FSDirectory* blob_output_directory, Statistics* stats,
+      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+      std::vector<SequenceNumber> existing_snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      const SnapshotChecker* snapshot_checker, JobContext* job_context,
+      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+      bool paranoid_file_checks, bool measure_io_stats,
+      const std::string& dbname, CompactionJobStats* compaction_job_stats,
+      Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      std::string full_history_ts_low = "", std::string trim_ts = "",
+      BlobFileCompletionCallback* blob_callback = nullptr,
+      int* bg_compaction_scheduled = nullptr,
+      int* bg_bottom_compaction_scheduled = nullptr);
+
+  virtual ~CompactionJob();
+
+  // no copy/move
+  CompactionJob(CompactionJob&& job) = delete;
+  CompactionJob(const CompactionJob& job) = delete;
+  CompactionJob& operator=(const CompactionJob& job) = delete;
+
+  // REQUIRED: mutex held
+  // Prepare for the compaction by setting up boundaries for each subcompaction
+  void Prepare();
+  // REQUIRED: mutex not held
+  // Launch threads for each subcompaction and wait for them to finish. After
+  // that, verify table is usable and finally do bookkeeping to unify
+  // subcompaction results
+  Status Run();
+
+  // REQUIRED: mutex held
+  // Add compaction input/output to the current version
+  Status Install(const MutableCFOptions& mutable_cf_options);
+
+  // Return the IO status
+  IOStatus io_status() const { return io_status_; }
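+
+  // Typical driver sequence (sketch under the REQUIRED notes above, assuming
+  // a DBImpl-like caller; not a verbatim excerpt):
+  //
+  //   CompactionJob job(...);               // construct, db mutex held
+  //   job.Prepare();                        // db mutex held
+  //   mutex->Unlock();
+  //   Status s = job.Run();                 // db mutex NOT held
+  //   mutex->Lock();
+  //   s = job.Install(mutable_cf_options);  // db mutex held again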
+
+ protected:
+  void UpdateCompactionStats();
+  void LogCompaction();
+  virtual void RecordCompactionIOStats();
+  void CleanupCompaction();
+
+  // Call the compaction filter. Then iterate through the input and compact
+  // the kv-pairs.
+  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+  CompactionState* compact_;
+  InternalStats::CompactionStatsFull compaction_stats_;
+  const ImmutableDBOptions& db_options_;
+  const MutableDBOptions mutable_db_options_copy_;
+  LogBuffer* log_buffer_;
+  FSDirectory* output_directory_;
+  Statistics* stats_;
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level_;
+
+  Env::WriteLifeTimeHint write_hint_;
+
+  IOStatus io_status_;
+
+  CompactionJobStats* compaction_job_stats_;
+
+ private:
+  friend class CompactionJobTestBase;
+
+  // Generates a histogram representing potential divisions of key ranges from
+  // the input. It adds the starting and/or ending keys of certain input files
+  // to the working set and then finds the approximate size of data in between
+  // each consecutive pair of slices. Then it divides these ranges into
+  // consecutive groups such that each group has a similar size.
+  void GenSubcompactionBoundaries();
+
+  // Get the number of planned subcompactions based on max_subcompactions and
+  // extra reserved resources
+  uint64_t GetSubcompactionsLimit();
+
+  // Additional threads may be reserved, with the count stored in
+  // extra_num_subcompaction_threads_reserved_. For now, this happens only if
+  // the compaction priority is round-robin and max_subcompactions is not
+  // sufficient (extra resources may be needed)
+  void AcquireSubcompactionResources(int num_extra_required_subcompactions);
+
+  // Additional threads may be reserved during IncreaseSubcompactionResources()
+  // if num_actual_subcompactions is less than num_planned_subcompactions.
+  // Additional threads will be released and the bg_compaction_scheduled_ or
+  // bg_bottom_compaction_scheduled_ will be updated if they are used.
+  // DB Mutex lock is required.
+  void ShrinkSubcompactionResources(uint64_t num_extra_resources);
+
+  // Release all reserved threads and update the compaction limits.
+  void ReleaseSubcompactionResources();
+
+  CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
+      SubcompactionState* sub_compact);
+
+  // Update the thread status for starting a compaction.
+  void ReportStartedCompaction(Compaction* compaction);
+
+  Status FinishCompactionOutputFile(const Status& input_status,
+                                    SubcompactionState* sub_compact,
+                                    CompactionOutputs& outputs,
+                                    const Slice& next_table_min_key);
+  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+  Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
+                                  CompactionOutputs& outputs);
+  void UpdateCompactionJobStats(
+      const InternalStats::CompactionStats& stats) const;
+  void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+                         CompactionJobStats* compaction_job_stats = nullptr);
+
+  void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
+                                        int input_level);
+
+  void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
+
+  void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+
+  uint32_t job_id_;
+
+  // DBImpl state
+  const std::string& dbname_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  const FileOptions file_options_;
+
+  Env* env_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  FileSystemPtr fs_;
+  // env options optimized for compaction table reads
+  FileOptions file_options_for_read_;
+  VersionSet* versions_;
+  const std::atomic<bool>* shutting_down_;
+  const std::atomic<bool>& manual_compaction_canceled_;
+  FSDirectory* db_directory_;
+  FSDirectory* blob_output_directory_;
+  InstrumentedMutex* db_mutex_;
+  ErrorHandler* db_error_handler_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 that lies
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots_;
+
+  // This is the earliest snapshot that could be used for write-conflict
+  // checking by a transaction. For any user-key newer than this snapshot, we
+  // should make sure not to remove evidence that a write occurred.
+  SequenceNumber earliest_write_conflict_snapshot_;
+
+  const SnapshotChecker* const snapshot_checker_;
+
+  JobContext* job_context_;
+
+  std::shared_ptr<Cache> table_cache_;
+
+  EventLogger* event_logger_;
+
+  bool paranoid_file_checks_;
+  bool measure_io_stats_;
+  // Stores the boundaries that designate the key range of each subcompaction
+  std::vector<std::string> boundaries_;
+  Env::Priority thread_pri_;
+  std::string full_history_ts_low_;
+  std::string trim_ts_;
+  BlobFileCompletionCallback* blob_callback_;
+
+  uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
+  // Stores the number of threads reserved in the shared env_ for extra
+  // subcompactions under the kRoundRobin compaction priority
+  int extra_num_subcompaction_threads_reserved_;
+
+  // Stores the pointers to bg_compaction_scheduled_ and
+  // bg_bottom_compaction_scheduled_ in DBImpl. The mutex is required when
+  // accessing or updating them.
+  int* bg_compaction_scheduled_;
+  int* bg_bottom_compaction_scheduled_;
+
+  // Stores the sequence number to time mapping gathered from all input files;
+  // it also collects the smallest_seqno -> oldest_ancester_time from the SSTs.
+  SeqnoToTimeMapping seqno_time_mapping_;
+
+  // Minimal sequence number for preserving the time information. Time info
+  // older than this sequence number won't be preserved after the compaction,
+  // and for a bottommost compaction the seq num will be zeroed out.
+  SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Minimal sequence number to preclude the data from the last level. If a
+  // key has a bigger (newer) sequence number than this, it will be precluded
+  // from the last level (output to the penultimate level instead).
+  SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  // Get the table file name of the file being output, which should also be in
+  // `output_directory_`.
+  virtual std::string GetTableFileName(uint64_t file_number);
+  // The rate limiter priority (io_priority) is determined dynamically here.
+  // The compaction read and write priorities are the same for different
+  // scenarios, such as when writes are stalled.
+  Env::IOPriority GetRateLimiterPriority();
+};
+
+// CompactionServiceInput is used to pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed through the
+// MANIFEST file.
+struct CompactionServiceInput {
+  ColumnFamilyDescriptor column_family;
+
+  DBOptions db_options;
+
+  std::vector<SequenceNumber> snapshots;
+
+  // SST files for compaction; it should already be expanded to include all the
+  // files needed for this compaction, for both input level files and output
+  // level files.
+  std::vector<std::string> input_files;
+  int output_level;
+
+  // db_id is used to generate a unique id for the SSTs on the remote compactor
+  std::string db_id;
+
+  // information for subcompaction
+  bool has_begin = false;
+  std::string begin;
+  bool has_end = false;
+  std::string end;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+  Status Write(std::string* output);
+
+  // Initialize a dummy ColumnFamilyDescriptor
+  CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceInput* other);
+  bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+  std::string file_name;
+  SequenceNumber smallest_seqno;
+  SequenceNumber largest_seqno;
+  std::string smallest_internal_key;
+  std::string largest_internal_key;
+  uint64_t oldest_ancester_time;
+  uint64_t file_creation_time;
+  uint64_t paranoid_hash;
+  bool marked_for_compaction;
+  UniqueId64x2 unique_id;
+
+  CompactionServiceOutputFile() = default;
+  CompactionServiceOutputFile(
+      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+      std::string _smallest_internal_key, std::string _largest_internal_key,
+      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+      uint64_t _paranoid_hash, bool _marked_for_compaction,
+      UniqueId64x2 _unique_id)
+      : file_name(name),
+        smallest_seqno(smallest),
+        largest_seqno(largest),
+        smallest_internal_key(std::move(_smallest_internal_key)),
+        largest_internal_key(std::move(_largest_internal_key)),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        paranoid_hash(_paranoid_hash),
+        marked_for_compaction(_marked_for_compaction),
+        unique_id(std::move(_unique_id)) {}
+};
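+
+// Usage sketch (hypothetical, based only on the Read()/Write() declarations
+// above): the primary instance serializes a CompactionServiceInput and ships
+// the string to the remote compactor, which deserializes it:
+//
+//   CompactionServiceInput input;
+//   // ... fill column_family, db_options, input_files, output_level ...
+//   std::string payload;
+//   Status s = input.Write(&payload);  // serialize on the primary
+//   CompactionServiceInput remote_input;
+//   s = CompactionServiceInput::Read(payload, &remote_input);  // remote side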
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance; with this information, the primary db instance with write
+// permission is able to install the result into the DB.
+struct CompactionServiceResult {
+  Status status;
+  std::vector<CompactionServiceOutputFile> output_files;
+  int output_level;
+
+  // location of the output files
+  std::string output_path;
+
+  // some statistics about the compaction
+  uint64_t num_output_records = 0;
+  uint64_t total_bytes = 0;
+  uint64_t bytes_read = 0;
+  uint64_t bytes_written = 0;
+  CompactionJobStats stats;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str, CompactionServiceResult* obj);
+  Status Write(std::string* output);
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceResult* other);
+  bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceCompactionJob is a read-only compaction job; it takes its
+// input information from `compaction_service_input`, puts the result
+// information in `compaction_service_result`, and generates the SST files
+// under `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+  CompactionServiceCompactionJob(
+      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const FileOptions& file_options, VersionSet* versions,
+      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+      FSDirectory* output_directory, Statistics* stats,
+      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+      std::vector<SequenceNumber> existing_snapshots,
+      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const std::string& db_id, const std::string& db_session_id,
+      std::string output_path,
+      const CompactionServiceInput& compaction_service_input,
+      CompactionServiceResult* compaction_service_result);
+
+  // Run the compaction in the current thread and return the result
+  Status Run();
+
+  void CleanupCompaction();
+
+  IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+  void RecordCompactionIOStats() override;
+
+ private:
+  // Get the table file name in output_path
+  std::string GetTableFileName(uint64_t file_number) override;
+  // Specifies the compaction output path; otherwise the default DB path is
+  // used
+  const std::string output_path_;
+
+  // Compaction job input
+  const CompactionServiceInput& compaction_input_;
+
+  // Compaction job result
+  CompactionServiceResult* compaction_result_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
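+
+// End-to-end flow (annotation, not upstream code): with a remote compaction
+// service, the primary DB builds a CompactionServiceInput, the remote side
+// runs CompactionServiceCompactionJob::Run() against it and fills a
+// CompactionServiceResult (SST files written under output_path), and the
+// primary installs those outputs into its own LSM tree through the usual
+// VersionEdit/MANIFEST machinery.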
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+  std::string r;
+  test::CompressibleString(rnd, ratio, len, &r);
+  return r;
+}
+
+std::string Key(uint64_t key, int length) {
+  const int kBufSize = 1000;
+  char buf[kBufSize];
+  if (length > kBufSize) {
+    length = kBufSize;
+  }
+  snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+  return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+                               public testing::WithParamInterface<uint32_t> {
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  Env* env_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+  uint32_t max_subcompactions_;
+
+  Options last_options_;
+
+  CompactionJobStatsTest() : env_(Env::Default()) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+    alternative_wal_dir_ = dbname_ + "/wal";
+    Options options;
+    options.create_if_missing = true;
+    max_subcompactions_ = GetParam();
+    options.max_subcompactions = max_subcompactions_;
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy again to also cover the case where no alternative WAL dir is
+    // used.
+ EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + Reopen(options); + } + + ~CompactionJobStatsTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + EXPECT_OK(DestroyDB(dbname_, options)); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + DBImpl* dbfull() { return static_cast_with_check(db_); } + + void CreateColumnFamilies(const std::vector& cfs, + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { + Close(); + EXPECT_EQ(cfs.size(), options.size()); + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + } + DBOptions db_opts = DBOptions(options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + Close(); + std::vector v_opts(cfs.size(), options); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(const Options& options) { + // Destroy using last options + Destroy(last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(const Options& options) { + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); + } + + Status TryReopen(const Options& options) { + Close(); + last_options_ = options; + return DB::Open(options, dbname_, &db_); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + return db_->Put(wo, k, v); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } + + Status Delete(int cf, const std::string& k) { + return 
db_->Delete(WriteOptions(), handles_[cf], k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level, int cf = 0) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level), + &property)); + } + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { + Range r(start, limit); + if (cf == 0) { + return db_->GetApproximateSizes(&r, 1, size); + } else { + return db_->GetApproximateSizes(handles_[1], &r, 1, size); + } + } + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + void Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); + } + + void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf], + true /* disallow trivial move */)); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
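+  // For example (editorial note, not upstream): MakeTables(3, Key(1, 10),
+  // Key(9, 10)) flushes three overlapping L0 files, each containing just the
+  // two boundary keys.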
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  static void SetDeletionCompactionStats(CompactionJobStats* stats,
+                                         uint64_t input_deletions,
+                                         uint64_t expired_deletions,
+                                         uint64_t records_replaced) {
+    stats->num_input_deletion_records = input_deletions;
+    stats->num_expired_deletion_records = expired_deletions;
+    stats->num_records_replaced = records_replaced;
+  }
+
+  void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest,
+                              int key_size, int value_size, uint64_t interval,
+                              double ratio, int cf = 0) {
+    for (auto key = smallest; key < largest; key += interval) {
+      ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+                    Slice(RandomString(rnd, value_size, ratio))));
+    }
+    ASSERT_OK(Flush(cf));
+  }
+
+  // This function behaves with the implicit understanding that two
+  // rounds of keys are inserted into the database, as per the behavior
+  // of the DeletionStatsTest.
+  void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+                             uint64_t interval, int deletion_interval,
+                             int key_size, uint64_t cutoff_key_num,
+                             CompactionJobStats* stats, int cf = 0) {
+    // interval needs to be >= 2 so that deletion entries can be inserted that
+    // are intended not to result in an actual key deletion, by using an
+    // offset of 1 from an existing key.
+    ASSERT_GE(interval, 2);
+
+    uint64_t ctr = 1;
+    uint32_t deletions_made = 0;
+    uint32_t num_deleted = 0;
+    uint32_t num_expired = 0;
+    for (auto key = smallest; key <= largest; key += interval, ctr++) {
+      if (ctr % deletion_interval == 0) {
+        ASSERT_OK(Delete(cf, Key(key, key_size)));
+        deletions_made++;
+        num_deleted++;
+
+        if (key > cutoff_key_num) {
+          num_expired++;
+        }
+      }
+    }
+
+    // Insert some deletions for keys that don't exist, both inside and
+    // outside the key range.
+    ASSERT_OK(Delete(cf, Key(smallest + 1, key_size)));
+    deletions_made++;
+
+    ASSERT_OK(Delete(cf, Key(smallest - 1, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Delete(cf, Key(smallest - 9, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Flush(cf));
+    SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted);
+  }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+  CompactionJobStatsChecker()
+      : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+  size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+  void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction has completed, this function will verify the returned
+  // CompactionJobInfo against the oldest CompactionJobInfo added earlier
+  // in "expected_stats_" which has not yet been used for verification.
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    if (verify_next_comp_io_stats_) {
+      ASSERT_GT(ci.stats.file_write_nanos, 0);
+      ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+      ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+      ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+      verify_next_comp_io_stats_ = false;
+    }
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (expected_stats_.size()) {
+      Verify(ci.stats, expected_stats_.front());
+      expected_stats_.pop();
+    }
+  }
+
+  // A helper function which verifies whether two CompactionJobStats match.
+  // The verification of all compaction stats is done with ASSERT_EQ, except
+  // for the total input / output bytes, which we check with ASSERT_GE and
+  // ASSERT_LE under a reasonable bias: 10% in the uncompressed case and 20%
+  // when compression is used.
+  virtual void Verify(const CompactionJobStats& current_stats,
+                      const CompactionJobStats& stats) {
+    // time
+    ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+    ASSERT_EQ(current_stats.num_input_records, stats.num_input_records);
+    ASSERT_EQ(current_stats.num_input_files, stats.num_input_files);
+    ASSERT_EQ(current_stats.num_input_files_at_output_level,
+              stats.num_input_files_at_output_level);
+
+    ASSERT_EQ(current_stats.num_output_records, stats.num_output_records);
+    ASSERT_EQ(current_stats.num_output_files, stats.num_output_files);
+
+    ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction);
+    ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction);
+
+    // file size
+    double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+    ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+              stats.total_input_bytes);
+    ASSERT_LE(current_stats.total_input_bytes,
+              stats.total_input_bytes * (1.00 + kFileSizeBias));
+    ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+              stats.total_output_bytes);
+    ASSERT_LE(current_stats.total_output_bytes,
+              stats.total_output_bytes * (1.00 + kFileSizeBias));
+    ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+              stats.total_input_raw_key_bytes);
+    ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+              stats.total_input_raw_value_bytes);
+
+    ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+
+    ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix),
+              std::string(stats.smallest_output_key_prefix));
+    ASSERT_EQ(std::string(current_stats.largest_output_key_prefix),
+              std::string(stats.largest_output_key_prefix));
+  }
+
+  // Add an expected compaction stats entry, which will be used to
+  // verify the CompactionJobStats returned by the OnCompactionCompleted()
+  // callback.
+  void AddExpectedStats(const CompactionJobStats& stats) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    expected_stats_.push(stats);
+  }
+
+  void EnableCompression(bool flag) { compression_enabled_ = flag; }
+
+  bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+  std::mutex mutex_;
+  std::queue<CompactionJobStats> expected_stats_;
+  bool compression_enabled_;
+  bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+  // Verifies whether two CompactionJobStats match.
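+  // (Editorial note, not upstream: unlike the base-class Verify() above,
+  // which accepts total_input_bytes / total_output_bytes within a tolerance,
+  // e.g. with the 10% uncompressed bias an expected 1,000,000 bytes passes
+  // for current values roughly in [909,091, 1,100,000], this override checks
+  // only the deletion-related counters, and checks them exactly.)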
+ void Verify(const CompactionJobStats& current_stats, + const CompactionJobStats& stats) override { + ASSERT_EQ(current_stats.num_input_deletion_records, + stats.num_input_deletion_records); + ASSERT_EQ(current_stats.num_expired_deletion_records, + stats.num_expired_deletion_records); + ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys); + } +}; + +namespace { + +uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size, + size_t value_size, double compression_ratio = 1.0, + size_t block_size = 4096, + int bloom_bits_per_key = 10) { + const size_t kPerKeyOverhead = 8; + const size_t kFooterSize = 512; + + uint64_t data_size = static_cast( + num_records * + (key_size + value_size * compression_ratio + kPerKeyOverhead)); + + return data_size + kFooterSize + + num_records * bloom_bits_per_key / 8 // filter block + + data_size * (key_size + 8) / block_size; // index block +} + +namespace { + +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? prefix_length : src.size(); + dst->assign(src.data(), length); +} + +} // namespace + +CompactionJobStats NewManualCompactionJobStats( + const std::string& smallest_key, const std::string& largest_key, + size_t num_input_files, size_t num_input_files_at_output_level, + uint64_t num_input_records, size_t key_size, size_t value_size, + size_t num_output_files, uint64_t num_output_records, + double compression_ratio, uint64_t num_records_replaced, + bool is_full = false, bool is_manual = true) { + CompactionJobStats stats; + stats.Reset(); + + stats.num_input_records = num_input_records; + stats.num_input_files = num_input_files; + stats.num_input_files_at_output_level = num_input_files_at_output_level; + + stats.num_output_records = num_output_records; + stats.num_output_files = num_output_files; + + stats.total_input_bytes = + EstimatedFileSize(num_input_records / num_input_files, key_size, + value_size, compression_ratio) * + num_input_files; + stats.total_output_bytes = + EstimatedFileSize(num_output_records / num_output_files, key_size, + value_size, compression_ratio) * + num_output_files; + stats.total_input_raw_key_bytes = num_input_records * (key_size + 8); + stats.total_input_raw_value_bytes = num_input_records * value_size; + + stats.is_full_compaction = is_full; + stats.is_manual_compaction = is_manual; + + stats.num_records_replaced = num_records_replaced; + + CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength, + &stats.smallest_output_key_prefix); + CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength, + &stats.largest_output_key_prefix); + + return stats; +} + +CompressionType GetAnyCompression() { + if (Snappy_Supported()) { + return kSnappyCompression; + } else if (Zlib_Supported()) { + return kZlibCompression; + } else if (BZip2_Supported()) { + return kBZip2Compression; + } else if (LZ4_Supported()) { + return kLZ4Compression; + } else if (XPRESS_Supported()) { + return kXpressCompression; + } + + return kNoCompression; +} + +} // namespace + +TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { + Random rnd(301); + const int kBufSize = 100; + char buf[kBufSize]; + uint64_t key_base = 100000000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 100; + const int kTestScale = 8; + const int kKeySize = 10; + const int kValueSize = 1000; + const double kCompressionRatio = 0.5; + 
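+  // Editorial note (not upstream): with these constants and a compression
+  // ratio of 1.0, EstimatedFileSize() above works out, per 100-key L0 file,
+  // to approximately:
+  //   data   = 100 * (10 + 1000 + 8)      = 101,800 bytes
+  //   filter = 100 * 10 / 8               =     125 bytes
+  //   index  = 101,800 * (10 + 8) / 4096  =     447 bytes
+  //   total  = data + 512 (footer) + filter + index = 102,884 bytes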
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect. The expected CompactionJobStats is added
+  // via AddExpectedStats().
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  // just enough settings to hold off auto-compaction.
+  options.level0_file_num_compaction_trigger = kTestScale + 1;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options.bytes_per_sync = 512 * 1024;
+
+  options.report_bg_io_stats = true;
+  for (int test = 0; test < 2; ++test) {
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // 1st Phase: generate "num_L0_files" L0 files.
+    int num_L0_files = 0;
+    for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+         start_key += key_base) {
+      MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1,
+                             kKeySize, kValueSize, key_interval,
+                             compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d", ++num_L0_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+    ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1));
+
+    // 2nd Phase: perform L0 -> L1 compaction.
+    int L0_compaction_count = 6;
+    int count = 1;
+    std::string smallest_key;
+    std::string largest_key;
+    for (uint64_t start_key = key_base;
+         start_key <= key_base * L0_compaction_count;
+         start_key += key_base, count++) {
+      smallest_key = Key(start_key, 10);
+      largest_key = Key(start_key + key_base - key_interval, 10);
+      stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+          smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize,
+          kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      TEST_Compact(0, 1, smallest_key, largest_key);
+      snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // compact two files into one in the last L0 -> L1 compaction
+    int num_remaining_L0 = num_L0_files - L0_compaction_count;
+    smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        smallest_key, largest_key, num_remaining_L0, 0,
+        num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1,
+        num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    TEST_Compact(0, 1, smallest_key, largest_key);
+
+    int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+    num_L0_files = 0;
+    snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+    ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+    // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+    int sparseness = 2;
+    for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+         start_key += key_base * sparseness) {
+      MakeTableWithKeyValues(
+          &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize,
+          kValueSize, key_base * sparseness / num_keys_per_L0_file,
+          compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp
+    // When subcompactions are enabled, the number of output files increases
+    // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see whether the output could fit
+    // into a smaller number of files, as it does when running sequentially.
+    int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+    for (uint64_t start_key = key_base; num_L0_files > 1;
+         start_key += key_base * sparseness) {
+      smallest_key = Key(start_key, 10);
+      largest_key = Key(start_key + key_base * sparseness - key_interval, 10);
+      stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+          smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize,
+          kValueSize, num_output_files,
+          num_keys_per_L0_file * 2,  // 1/3 of the data will be updated.
+          compression_ratio, num_keys_per_L0_file));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      Compact(1, smallest_key, largest_key);
+      if (options.max_subcompactions == 1) {
+        --num_L1_files;
+      }
+      snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+    smallest_key = Key(key_base, 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1,
+        num_keys_per_L0_file * 3, kKeySize, kValueSize, 1,
+        num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    Compact(1, smallest_key, largest_key);
+
+    num_L1_files = options.max_subcompactions > 1 ?
7 : 4; + char L1_buf[4]; + snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files); + std::string L1_files(L1_buf); + ASSERT_EQ(L1_files, FilesPerLevel(1)); + options.compression = GetAnyCompression(); + if (options.compression == kNoCompression) { + break; + } + stats_checker->EnableCompression(true); + compression_ratio = kCompressionRatio; + + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)), + Slice(RandomString(&rnd, 512 * 1024, 1)))); + } + + ASSERT_OK(Flush(1)); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); + + stats_checker->set_verify_next_comp_io_stats(true); + std::atomic first_prepare_write(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) { + if (first_prepare_write.load()) { + options.env->SleepForMicroseconds(3); + first_prepare_write.store(false); + } + }); + + std::atomic first_flush(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) { + if (first_flush.load()) { + options.env->SleepForMicroseconds(3); + first_flush.store(false); + } + }); + + std::atomic first_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) { + if (first_sync.load()) { + options.env->SleepForMicroseconds(3); + first_sync.store(false); + } + }); + + std::atomic first_range_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { + if (first_range_sync.load()) { + options.env->SleepForMicroseconds(3); + first_range_sync.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Compact(1, smallest_key, largest_key); + + ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats()); + ASSERT_TRUE(!first_prepare_write.load()); + ASSERT_TRUE(!first_flush.load()); + ASSERT_TRUE(!first_sync.load()); + ASSERT_TRUE(!first_range_sync.load()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +TEST_P(CompactionJobStatsTest, DeletionStatsTest) { + Random rnd(301); + uint64_t key_base = 100000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 20; + const int kTestScale = 8; // make sure this is even + const int kKeySize = 10; + const int kValueSize = 100; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_L0_file; + uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval; + uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval; + const std::string smallest_key = Key(key_base - 10, kKeySize); + const std::string largest_key = Key(largest_key_num + 10, kKeySize); + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. 
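+  // (The handshake, per the checker classes above, is FIFO: the test queues
+  // one expected CompactionJobStats via AddExpectedStats() before triggering
+  // a compaction, and OnCompactionCompleted() pops and verifies the oldest
+  // entry that has not been verified yet.)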
+ auto* stats_checker = new CompactionJobDeletionStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = kTestScale + 1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_bytes_for_level_multiplier = 2; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Stage 1: Generate several L0 files and then send them to L2 by + // using CompactRangeOptions and CompactRange(). These files will + // have a strict subset of the keys from the full key-range + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + } + + CompactRangeOptions cr_options; + cr_options.change_level = true; + cr_options.target_level = 2; + ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr)); + ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); + + // Stage 2: Generate files including keys from the entire key range + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + } + + // Send these L0 files to L1 + TEST_Compact(0, 1, smallest_key, largest_key); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + + // Add a new record and flush so now there is a L0 file + // with a value too (not just deletions from the next step) + ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test")); + ASSERT_OK(Flush(1)); + + // Stage 3: Generate L0 files with some deletions so now + // there are files with the same key range in L0, L1, and L2 + int deletion_interval = 3; + CompactionJobStats first_compaction_stats; + SelectivelyDeleteKeys(key_base, largest_key_num, key_interval, + deletion_interval, kKeySize, cutoff_key_num, + &first_compaction_stats, 1); + + stats_checker->AddExpectedStats(first_compaction_stats); + + // Stage 4: Trigger compaction and verify the stats + TEST_Compact(0, 1, smallest_key, largest_key); +} + +namespace { +int GetUniversalCompactionInputUnits(uint32_t num_flushes) { + uint32_t compaction_input_units; + for (compaction_input_units = 1; num_flushes >= compaction_input_units; + compaction_input_units *= 2) { + if ((num_flushes & compaction_input_units) != 0) { + return compaction_input_units > 1 ? 
compaction_input_units : 0;
+    }
+  }
+  return 0;
+}
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+  int num_keys_per_table = 100;
+  const uint32_t kTestScale = 6;
+  const int kKeySize = 10;
+  const int kValueSize = 900;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_table;
+
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = num_keys_per_table * 1000;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 1;
+  options.compaction_options_universal.max_size_amplification_percent = 1000;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Generates the expected CompactionJobStats for each compaction
+  for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+    //
+    // For example, if a newly flushed file is 100k, and a compaction has
+    // 4 input units, then this compaction inputs 400k.
+    uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+    if (num_input_units == 0) {
+      continue;
+    }
+    // A full compaction only happens when the number of flushes equals
+    // the number of compaction input runs.
+    bool is_full = num_flushes == num_input_units;
+    // The following statement determines the expected smallest key
+    // based on whether it is a full compaction.
+    uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1);
+
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        Key(smallest_key, 10),
+        Key(smallest_key + key_base * num_input_units - key_interval, 10),
+        num_input_units, num_input_units > 2 ?
num_input_units / 2 : 0, + num_keys_per_table * num_input_units, kKeySize, kValueSize, + num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full, + false)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); + + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest, + ::testing::Values(1, 4)); +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE + +#else + +int main(int /*argc*/, char** /*argv*/) { return 0; } +#endif // !defined(IOS_CROSS_COMPILE) diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc new file mode 100644 index 000000000..c87871100 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_test.cc @@ -0,0 +1,2451 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_job.h" + +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "db/version_set.h" +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "options/options_helper.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" +#include "table/unique_id_impl.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void VerifyInitializationOfCompactionJobStats( + const CompactionJobStats& compaction_job_stats) { +#if !defined(IOS_CROSS_COMPILE) + ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_records, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U); + + ASSERT_EQ(compaction_job_stats.num_output_records, 0U); + ASSERT_EQ(compaction_job_stats.num_output_files, 0U); + + ASSERT_EQ(compaction_job_stats.is_manual_compaction, true); + + ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0); + ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0); + + 
ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U); + ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U); + + ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U); +#endif // !defined(IOS_CROSS_COMPILE) +} + +// Mock FSWritableFile for testing io priority. +// Only override the essential functions for testing compaction io priority. +class MockTestWritableFile : public FSWritableFileOwnerWrapper { + public: + MockTestWritableFile(std::unique_ptr&& file, + Env::IOPriority io_priority) + : FSWritableFileOwnerWrapper(std::move(file)), + write_io_priority_(io_priority) {} + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Append(data, options, dbg); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Append(data, options, verification_info, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Close(options, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Fsync(options, dbg); + } + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->GetFileSize(options, dbg); + } + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->RangeSync(offset, nbytes, options, dbg); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + target()->PrepareWrite(offset, len, options, dbg); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Allocate(offset, len, options, dbg); + } + + private: + Env::IOPriority write_io_priority_; +}; + +// Mock FSRandomAccessFile for testing io priority. +// Only override the essential functions for testing compaction io priority. 
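+// (Editorial note, not upstream: both mock files follow the same pattern,
+// namely assert on options.rate_limiter_priority and then forward to the
+// wrapped target. MockTestFileSystem below injects them by wrapping
+// NewWritableFile()/NewRandomAccessFile(), conceptually:
+//
+//   std::shared_ptr<FileSystem> fs = std::make_shared<MockTestFileSystem>(
+//       FileSystem::Default(), Env::IO_LOW /*reads*/, Env::IO_LOW /*writes*/);
+//
+// so every compaction file opened through `fs` checks its IOOptions.)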
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MockTestRandomAccessFile(std::unique_ptr&& file, + Env::IOPriority io_priority) + : FSRandomAccessFileOwnerWrapper(std::move(file)), + read_io_priority_(io_priority) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + EXPECT_EQ(options.rate_limiter_priority, read_io_priority_); + return target()->Read(offset, n, options, result, scratch, dbg); + } + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, read_io_priority_); + return target()->Prefetch(offset, n, options, dbg); + } + + private: + Env::IOPriority read_io_priority_; +}; + +// Mock FileSystem for testing io priority. +class MockTestFileSystem : public FileSystemWrapper { + public: + explicit MockTestFileSystem(const std::shared_ptr& base, + Env::IOPriority read_io_priority, + Env::IOPriority write_io_priority) + : FileSystemWrapper(base), + read_io_priority_(read_io_priority), + write_io_priority_(write_io_priority) {} + + static const char* kClassName() { return "MockTestFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + EXPECT_OK(s); + result->reset( + new MockTestRandomAccessFile(std::move(*result), read_io_priority_)); + return s; + } + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + EXPECT_OK(s); + result->reset( + new MockTestWritableFile(std::move(*result), write_io_priority_)); + return s; + } + + private: + Env::IOPriority read_io_priority_; + Env::IOPriority write_io_priority_; +}; + +enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 }; + +} // namespace + +class CompactionJobTestBase : public testing::Test { + protected: + CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function encode_u64_ts, + bool test_io_priority, TableTypeForTest table_type) + : dbname_(std::move(dbname)), + ucmp_(ucmp), + db_options_(), + mutable_cf_options_(cf_options_), + mutable_db_options_(), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + versions_(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()), + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)), + test_io_priority_(test_io_priority), + table_type_(table_type) { + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + env_ = base_env; + fs_ = env_->GetFileSystem(); + // set default for the tests + mutable_cf_options_.target_file_size_base = 1024 * 1024; + mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024; + } + + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.env = env_; + db_options_.fs = fs_; + 
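+    // Editorial note (not upstream): env_ and fs_ were resolved in the
+    // constructor via test::CreateEnvFromSystem(), so the whole harness can
+    // be pointed at a custom Env/FileSystem (for instance through the
+    // TEST_ENV_URI / TEST_FS_URI environment variables) without code changes.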
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    cf_options_.comparator = ucmp_;
+    if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+      BlockBasedTableOptions table_options;
+      cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    } else if (table_type_ == TableTypeForTest::kMockTable) {
+      cf_options_.table_factory = mock_table_factory_;
+    } else {
+      assert(false);
+    }
+  }
+
+  std::string GenerateFileName(uint64_t file_number) {
+    FileMetaData meta;
+    std::vector<DbPath> db_paths;
+    db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+    meta.fd = FileDescriptor(file_number, 0, 0);
+    return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+  }
+
+  std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+                     const ValueType t, uint64_t ts = 0) {
+    std::string user_key_with_ts = user_key + encode_u64_ts_(ts);
+    return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString();
+  }
+
+  static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+                             uint64_t size) {
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                          kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+                                uint64_t size, uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+                             size, kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrInlinedTTL(const Slice& value,
+                                       uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+    return blob_index;
+  }
+
+  // Creates a table with the specified key value pairs.
+  void CreateTable(const std::string& table_name,
+                   const mock::KVVector& contents, uint64_t& file_size) {
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(),
+                                          &file_writer, nullptr);
+    ASSERT_OK(s);
+    std::unique_ptr<TableBuilder> table_builder(
+        cf_options_.table_factory->NewTableBuilder(
+            TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
+                                cfd_->internal_comparator(),
+                                cfd_->int_tbl_prop_collector_factories(),
+                                CompressionType::kNoCompression,
+                                CompressionOptions(), 0 /* column_family_id */,
+                                kDefaultColumnFamilyName, -1 /* level */),
+            file_writer.get()));
+    // Build table.
+ for (auto kv : contents) { + std::string key; + std::string value; + std::tie(key, value) = kv; + table_builder->Add(key, value); + } + ASSERT_OK(table_builder->Finish()); + file_size = table_builder->FileSize(); + } + + void AddMockFile(const mock::KVVector& contents, int level = 0) { + assert(contents.size() > 0); + + bool first_key = true; + std::string smallest, largest; + InternalKey smallest_key, largest_key; + SequenceNumber smallest_seqno = kMaxSequenceNumber; + SequenceNumber largest_seqno = 0; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + for (auto kv : contents) { + ParsedInternalKey key; + std::string skey; + std::string value; + std::tie(skey, value) = kv; + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); + + smallest_seqno = std::min(smallest_seqno, key.sequence); + largest_seqno = std::max(largest_seqno, key.sequence); + + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) { + smallest.assign(key.user_key.data(), key.user_key.size()); + smallest_key.DecodeFrom(skey); + } + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, largest) > 0) { + largest.assign(key.user_key.data(), key.user_key.size()); + largest_key.DecodeFrom(skey); + } + + first_key = false; + + if (pik_status.ok() && key.type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + continue; + } + + if (blob_index.IsInlined() || blob_index.HasTTL() || + blob_index.file_number() == kInvalidBlobFileNumber) { + continue; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } + } + + uint64_t file_number = versions_->NewFileNumber(); + + uint64_t file_size = 0; + if (table_type_ == TableTypeForTest::kBlockBasedTable) { + CreateTable(GenerateFileName(file_number), contents, file_size); + } else if (table_type_ == TableTypeForTest::kMockTable) { + file_size = 10; + EXPECT_OK(mock_table_factory_->CreateMockTable( + env_, GenerateFileName(file_number), std::move(contents))); + } else { + assert(false); + } + + VersionEdit edit; + edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key, + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + mutex_.Lock(); + EXPECT_OK( + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_, nullptr)); + mutex_.Unlock(); + } + + void VerifyTables(int output_level, + const std::vector& expected_results, + std::vector expected_oldest_blob_file_numbers) { + if (expected_results.empty()) { + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + return; + } + int expected_output_file_num = 0; + for (const auto& e : expected_results) { + if (!e.empty()) { + ++expected_output_file_num; + } + } + ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files); + if (expected_output_file_num == 0) { + return; + } + + if (expected_oldest_blob_file_numbers.empty()) { + expected_oldest_blob_file_numbers.resize(expected_output_file_num, + kInvalidBlobFileNumber); + } + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + if (table_type_ == TableTypeForTest::kMockTable) { + ASSERT_EQ(compaction_job_stats_.num_output_files, + expected_results.size()); + 
mock_table_factory_->AssertLatestFiles(expected_results); + } else { + assert(table_type_ == TableTypeForTest::kBlockBasedTable); + } + + auto output_files = + cfd->current()->storage_info()->LevelFiles(output_level); + ASSERT_EQ(expected_output_file_num, output_files.size()); + + if (table_type_ == TableTypeForTest::kMockTable) { + assert(output_files.size() == + static_cast(expected_output_file_num)); + const FileMetaData* const output_file = output_files[0]; + ASSERT_EQ(output_file->oldest_blob_file_number, + expected_oldest_blob_file_numbers[0]); + return; + } + + for (size_t i = 0; i < expected_results.size(); ++i) { + const FileMetaData* const output_file = output_files[i]; + std::string file_name = GenerateFileName(output_file->fd.GetNumber()); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr freader; + IOStatus ios = RandomAccessFileReader::Create( + fs, file_name, FileOptions(), &freader, nullptr); + ASSERT_OK(ios); + std::unique_ptr table_reader; + uint64_t file_size = output_file->fd.GetFileSize(); + ReadOptions read_opts; + Status s = cf_options_.table_factory->NewTableReader( + read_opts, + TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(), + cfd_->internal_comparator()), + std::move(freader), file_size, &table_reader, false); + ASSERT_OK(s); + assert(table_reader); + std::unique_ptr iiter( + table_reader->NewIterator(read_opts, nullptr, nullptr, true, + TableReaderCaller::kUncategorized)); + assert(iiter); + + mock::KVVector from_db; + for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { + const Slice key = iiter->key(); + const Slice value = iiter->value(); + from_db.emplace_back( + make_pair(key.ToString(false), value.ToString(false))); + } + ASSERT_EQ(expected_results[i], from_db); + } + } + + void SetLastSequence(const SequenceNumber sequence_number) { + versions_->SetLastAllocatedSequence(sequence_number + 1); + versions_->SetLastPublishedSequence(sequence_number + 1); + versions_->SetLastSequence(sequence_number + 1); + } + + // returns expected result after compaction + mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { + stl_wrappers::KVMap expected_results; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int kMatchingKeys = kKeysPerFile / 2; + SequenceNumber sequence_number = 0; + + auto corrupt_id = [&](int id) { + return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile; + }; + + for (int i = 0; i < 2; ++i) { + auto contents = mock::MakeMockFile(); + for (int k = 0; k < kKeysPerFile; ++k) { + auto key = std::to_string(i * kMatchingKeys + k); + auto value = std::to_string(i * kKeysPerFile + k); + InternalKey internal_key(key, ++sequence_number, kTypeValue); + + // This is how the key will look like once it's written in bottommost + // file + InternalKey bottommost_internal_key(key, 0, kTypeValue); + + if (corrupt_id(k)) { + test::CorruptKeyType(&internal_key); + test::CorruptKeyType(&bottommost_internal_key); + } + contents.push_back({internal_key.Encode().ToString(), value}); + if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) { + expected_results.insert( + {bottommost_internal_key.Encode().ToString(), value}); + } + } + mock::SortKVVector(&contents, ucmp_); + + AddMockFile(contents); + } + + SetLastSequence(sequence_number); + + mock::KVVector expected_results_kvvector; + for (auto& kv : expected_results) { + expected_results_kvvector.push_back({kv.first, kv.second}); + } + + return expected_results_kvvector; + } + + void NewDB() { + EXPECT_OK(DestroyDB(dbname_, 
Options())); + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + std::shared_ptr info_log; + DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_); + Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log); + ASSERT_OK(s); + db_options_.info_log = info_log; + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + compaction_job_stats_.Reset(); + ASSERT_OK(SetIdentityFile(env_, dbname_)); + + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + std::unique_ptr file_writer; + const auto& fs = env_->GetFileSystem(); + s = WritableFileWriter::Create(fs, manifest, + fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + + ASSERT_OK(s); + { + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + + ASSERT_OK(s); + + cf_options_.merge_operator = merge_op_; + cf_options_.compaction_filter = compaction_filter_.get(); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + ASSERT_OK(versions_->Recover(column_families, false)); + cfd_ = versions_->GetColumnFamilySet()->GetDefault(); + } + + // input_files[i] on input_levels[i] + void RunLastLevelCompaction( + const std::vector>& input_files, + const std::vector input_levels, + std::function&& verify_func, + const std::vector& snapshots = {}) { + const int kLastLevel = cf_options_.num_levels - 1; + verify_per_key_placement_ = std::move(verify_func); + mock::KVVector empty_map; + RunCompaction(input_files, input_levels, {empty_map}, snapshots, + kMaxSequenceNumber, kLastLevel, false); + } + + // input_files[i] on input_levels[i] + void RunCompaction( + const std::vector>& input_files, + const std::vector& input_levels, + const std::vector& expected_results, + const std::vector& snapshots = {}, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + int output_level = 1, bool verify = true, + std::vector expected_oldest_blob_file_numbers = {}, + bool check_get_priority = false, + Env::IOPriority read_io_priority = Env::IO_TOTAL, + Env::IOPriority write_io_priority = Env::IO_TOTAL, + int max_subcompactions = 0) { + // For compaction, set fs as MockTestFileSystem to check the io_priority. 
+ if (test_io_priority_) { + db_options_.fs.reset( + new MockTestFileSystem(fs_, read_io_priority, write_io_priority)); + } + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + size_t num_input_files = 0; + std::vector compaction_input_files; + for (size_t i = 0; i < input_files.size(); ++i) { + auto level_files = input_files[i]; + CompactionInputFiles compaction_level; + compaction_level.level = input_levels[i]; + compaction_level.files.insert(compaction_level.files.end(), + level_files.begin(), level_files.end()); + compaction_input_files.push_back(compaction_level); + num_input_files += level_files.size(); + } + + std::vector grandparents; + // it should actually be the next non-empty level + const int kGrandparentsLevel = output_level + 1; + if (kGrandparentsLevel < cf_options_.num_levels) { + grandparents = + cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel); + } + + Compaction compaction( + cfd->current()->storage_info(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, + compaction_input_files, output_level, + mutable_cf_options_.target_file_size_base, + mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, + cfd->GetLatestMutableCFOptions()->compression_opts, + Temperature::kUnknown, max_subcompactions, grandparents, true); + compaction.SetInputVersion(cfd->current()); + + assert(db_options_.info_log); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + mutex_.Lock(); + EventLogger event_logger(db_options_.info_log.get()); + // TODO(yiwu) add a mock snapshot checker and add test for it. + SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); + const std::atomic kManualCompactionCanceledFalse{false}; + CompactionJob compaction_job( + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr, + nullptr, nullptr, &mutex_, &error_handler_, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, nullptr, + table_cache_, &event_logger, false, false, dbname_, + &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr), + full_history_ts_low_); + VerifyInitializationOfCompactionJobStats(compaction_job_stats_); + + compaction_job.Prepare(); + mutex_.Unlock(); + Status s = compaction_job.Run(); + ASSERT_OK(s); + ASSERT_OK(compaction_job.io_status()); + mutex_.Lock(); + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + ASSERT_OK(compaction_job.io_status()); + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + + if (verify) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + + VerifyTables(output_level, expected_results, + expected_oldest_blob_file_numbers); + } + + if (check_get_priority) { + CheckGetRateLimiterPriority(compaction_job); + } + + if (verify_per_key_placement_) { + // Verify per_key_placement compaction + assert(compaction.SupportsPerKeyPlacement()); + verify_per_key_placement_(compaction); + } + } + + void CheckGetRateLimiterPriority(CompactionJob& compaction_job) { + // When the state from WriteController is normal. 
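+    // Editorial note (not upstream): the mapping exercised below is
+    // normal write state -> Env::IO_LOW, while the delayed and stopped
+    // states escalate compaction I/O to Env::IO_USER, the highest
+    // rate-limiter priority, so that compaction is not throttled while the
+    // DB is stalling writes.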
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW); + + WriteController* write_controller = + compaction_job.versions_->GetColumnFamilySet()->write_controller(); + + { + // When the state from WriteController is Delayed. + std::unique_ptr delay_token = + write_controller->GetDelayToken(1000000); + ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER); + } + + { + // When the state from WriteController is Stopped. + std::unique_ptr stop_token = + write_controller->GetStopToken(); + ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER); + } + } + + std::shared_ptr env_guard_; + Env* env_; + std::shared_ptr fs_; + std::string dbname_; + const Comparator* const ucmp_; + EnvOptions env_options_; + ImmutableDBOptions db_options_; + ColumnFamilyOptions cf_options_; + MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + WriteBufferManager write_buffer_manager_; + std::unique_ptr versions_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; + CompactionJobStats compaction_job_stats_; + ColumnFamilyData* cfd_; + std::unique_ptr compaction_filter_; + std::shared_ptr merge_op_; + ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function encode_u64_ts_; + const bool test_io_priority_; + std::function verify_per_key_placement_; + const TableTypeForTest table_type_ = kMockTable; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false, + TableTypeForTest::kMockTable) {} +}; + +TEST_F(CompactionJobTest, Simple) { + NewDB(); + + auto expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) { + NewDB(); + + auto expected_results = CreateTwoFiles(true); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); + ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U); +} + +TEST_F(CompactionJobTest, SimpleDeletion) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"}, + {KeyStr("b", 1U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}}); + + SetLastSequence(4U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, OutputNothing) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}}); + + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile(); + + 
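+  // The deletion of "a" at seqno 2 shadows the put at seqno 1, and since no
+  // lower level can contain "a", both entries can be dropped, so the
+  // compaction is expected to produce no output file at all.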
SetLastSequence(4U); + + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleOverwrite) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 3U, kTypeValue), "val2"}, + {KeyStr("b", 4U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, + {KeyStr("b", 0U, kTypeValue), "val3"}}); + + SetLastSequence(4U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleNonLastLevel) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + // Because level 1 is not the last level, the sequence numbers of a and b + // cannot be set to 0 + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = + cfd_->current()->storage_info()->LevelFiles(input_levels[0]); + auto lvl1_files = + cfd_->current()->storage_info()->LevelFiles(input_levels[1]); + RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleMerge) { + merge_op_ = MergeOperators::CreateStringAppendOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeValue), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, NonAssocMerge) { + merge_op_ = MergeOperators::CreateStringAppendTESTOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeMerge), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +// Filters merge operands with value 10. 
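+// (test::FilterNumber(10U) drops any value or merge operand equal to the
+// encoded number 10, so for "a" only the operands 5 and 3 survive and merge
+// to 5 + 3 = 8 below.)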
+TEST_F(CompactionJobTest, MergeOperandFilter) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered + }); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)}, + {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, FilterSomeMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)}, + {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)}, + {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)} + // b does not appear because the operands are filtered + }); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +// Test where all operands/merge results are filtered out. 
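+// Every operand below carries the filtered value 10, so after filtering
+// nothing is left to merge and the compaction is expected to produce no
+// output (hence the empty KV map passed to RunCompaction).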
+TEST_F(CompactionJobTest, FilterAllMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file3, 2); + + SetLastSequence(11U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + + mock::KVVector empty_map; + RunCompaction({files}, {input_level}, {empty_map}); +} + +TEST_F(CompactionJobTest, SimpleSingleDelete) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeDeletion), ""}, + {KeyStr("b", 6U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("a", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SingleDeleteSnapshots) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("d", 9U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 9U, kTypeSingleDeletion), ""}, + {KeyStr("k", 12U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("l", 3U, kTypeSingleDeletion), ""}, + {KeyStr("l", 2U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("0", 2U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), "val1"}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 21U, kTypeValue), "val3"}, + {KeyStr("d", 8U, kTypeValue), "val4"}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("h", 2U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("A", 1U, kTypeValue), "val"}, + {KeyStr("e", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto 
expected_results = mock::MakeMockFile({
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 11U, kTypeValue), ""},
+      {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("b", 11U, kTypeValue), "val2"},
+      {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+      {KeyStr("c", 21U, kTypeValue), ""},
+      {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 1U, kTypeValue), "val1"},
+      {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 12U, kTypeValue), "val1"},
+      {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 8U, kTypeValue), "val2"},
+  });
+
+  SetLastSequence(22U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+  NewDB();
+
+  // Test multiple snapshots where the earliest snapshot is not a
+  // write-conflict snapshot.
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 23U, kTypeValue), "val"},
+      {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 23U, kTypeValue), "val"},
+      {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 31U, kTypeValue), "val"},
+      {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 23U, kTypeValue), "val2"},
+      {KeyStr("H", 31U, kTypeValue), "val"},
+      {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 23U, kTypeValue), "val"},
+      {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 34U, kTypeValue), "val2"},
+      {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 32U, kTypeValue), "val3"},
+      {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 34U, kTypeValue), "val"},
+      {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 25U, kTypeValue), "val2"},
+      {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 13U, kTypeValue), "val2"},
+      {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("C", 13U, kTypeValue), "val"},
+      {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 3U, kTypeValue), "val"},
+      {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 13U, kTypeValue), "val3"},
+      {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 13U, kTypeValue), "val2"},
+      {KeyStr("I", 13U, kTypeValue), "val4"},
+      {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 11U, kTypeValue), "val5"},
+      {KeyStr("J", 15U, kTypeValue), "val3"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 23U, kTypeValue), ""},
+      {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 23U, kTypeValue), ""},
+      {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 31U, kTypeValue), ""},
+      {KeyStr("H", 31U, kTypeValue), "val"},
+      {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 34U, kTypeValue), ""},
+      {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 13U, kTypeValue), "val4"},
+      {KeyStr("J", 34U, kTypeValue), "val"},
+      {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 25U, kTypeValue), "val2"},
+      {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 15U, kTypeValue), "val3"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+  });
+
+  SetLastSequence(24U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U},
+                20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("dummy", 5U, kTypeValue), "val2"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 0U, kTypeValue), "val"},
+  });
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("dummy", 0U, kTypeValue), "val2"},
+  });
+
+  SetLastSequence(22U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+  // Tests scenarios A through M, each involving multiple single delete/put
+  // pairs:
+  //
+  // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+  // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+  // C: SDel Put SDel Snapshot Put -> Snapshot Put
+  // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+  // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+  // F: Put SDel Put SDel Snapshot -> Removed
+  // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+  // H: (Put) Put SDel Put SDel Snapshot -> Removed
+  // I: (Put) Snapshot Put SDel Put SDel -> SDel
+  // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+  //    -> Snapshot Put
+  // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+  //    -> Snapshot Put Snapshot SDel
+  // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel
+  //    -> Snapshot SDel Put SDel
+  // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel
+  //    -> SDel Snapshot Put SDel
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 13U, kTypeValue), "val5"},
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 13U, kTypeValue), "val2"},
+      {KeyStr("C", 14U, kTypeValue), "val3"},
+      {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("D", 11U, kTypeValue), "val4"},
+      {KeyStr("G", 15U, kTypeValue), "val"},
+      {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 13U, kTypeValue), "val"},
+      {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 13U, kTypeValue), "val"},
+      {KeyStr("J", 15U, kTypeValue), "val"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 12U, kTypeValue), "val"},
+      {KeyStr("J", 11U, kTypeValue), "val"},
+      {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 15U, kTypeValue), "val1"},
+      {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 12U, kTypeValue), "val2"},
+      {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 15U, kTypeValue), "val"},
+      {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 12U, kTypeValue), "val"},
+      {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 16U, kTypeSingleDeletion), ""},
{KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), "val"}, + {KeyStr("M", 13U, kTypeSingleDeletion), ""}, + {KeyStr("M", 12U, kTypeSingleDeletion), ""}, + {KeyStr("M", 11U, kTypeValue), "val"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 12U, kTypeSingleDeletion), ""}, + {KeyStr("B", 11U, kTypeValue), "val2"}, + {KeyStr("C", 10U, kTypeSingleDeletion), ""}, + {KeyStr("C", 9U, kTypeValue), "val6"}, + {KeyStr("C", 8U, kTypeSingleDeletion), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), "val"}, + {KeyStr("E", 5U, kTypeSingleDeletion), ""}, + {KeyStr("E", 4U, kTypeValue), "val"}, + {KeyStr("F", 6U, kTypeSingleDeletion), ""}, + {KeyStr("F", 5U, kTypeValue), "val"}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("H", 6U, kTypeSingleDeletion), ""}, + {KeyStr("H", 5U, kTypeValue), "val"}, + {KeyStr("H", 4U, kTypeSingleDeletion), ""}, + {KeyStr("H", 3U, kTypeValue), "val"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val"}, + {KeyStr("J", 6U, kTypeSingleDeletion), ""}, + {KeyStr("J", 5U, kTypeSingleDeletion), ""}, + {KeyStr("J", 4U, kTypeValue), "val"}, + {KeyStr("J", 3U, kTypeSingleDeletion), ""}, + {KeyStr("J", 2U, kTypeValue), "val"}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("K", 7U, kTypeValue), "val4"}, + {KeyStr("K", 6U, kTypeSingleDeletion), ""}, + {KeyStr("K", 5U, kTypeValue), "val5"}, + {KeyStr("K", 2U, kTypeSingleDeletion), ""}, + {KeyStr("K", 1U, kTypeSingleDeletion), ""}, + {KeyStr("L", 5U, kTypeSingleDeletion), ""}, + {KeyStr("L", 4U, kTypeValue), "val"}, + {KeyStr("L", 3U, kTypeSingleDeletion), ""}, + {KeyStr("L", 2U, kTypeValue), "val"}, + {KeyStr("L", 1U, kTypeSingleDeletion), ""}, + {KeyStr("M", 10U, kTypeSingleDeletion), ""}, + {KeyStr("M", 7U, kTypeValue), "val"}, + {KeyStr("M", 5U, kTypeSingleDeletion), ""}, + {KeyStr("M", 4U, kTypeValue), "val"}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("D", 1U, kTypeValue), "val"}, + {KeyStr("H", 1U, kTypeValue), "val"}, + {KeyStr("I", 2U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({ + {KeyStr("M", 1U, kTypeValue), "val"}, + }); + AddMockFile(file4, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), ""}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), ""}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), ""}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), ""}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), ""}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), ""}, + {KeyStr("L", 11U, 
kTypeSingleDeletion), ""}, + {KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), ""}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}}); + + SetLastSequence(22U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U); +} + +// This test documents the behavior where a corrupt key follows a deletion or a +// single deletion and the (single) deletion gets removed while the corrupt key +// gets written out. TODO(noetzli): We probably want a better way to treat +// corrupt keys. +TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"}, + {test::KeyStr("a", 5U, kTypeDeletion), ""}, + {test::KeyStr("a", 4U, kTypeValue, true), "val"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""}, + {test::KeyStr("b", 2U, kTypeValue, true), "val"}, + {test::KeyStr("c", 1U, kTypeValue), "val2"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"}, + {test::KeyStr("a", 0U, kTypeValue, true), "val"}, + {test::KeyStr("b", 0U, kTypeValue, true), "val"}, + {test::KeyStr("c", 0U, kTypeValue), "val2"}}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, OldestBlobFileNumber) { + NewDB(); + + // Note: blob1 is inlined TTL, so it will not be considered for the purposes + // of identifying the oldest referenced blob file. Similarly, blob6 will be + // ignored because it has TTL and hence refers to a TTL blob file. 
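+  // BlobStr(blob_file_number, offset, size) below builds a blob index
+  // referencing a standalone blob file. Among the references that do count,
+  // blob5's file number 19 is the smallest (vs. 59, 138 and 199), which is
+  // why {19} is passed as expected_oldest_blob_file_numbers at the end.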
+ const stl_wrappers::KVMap::value_type blob1( + KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL)); + const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex), + BlobStr(59, 123456, 999)); + const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex), + BlobStr(138, 1000, 1 << 8)); + auto file1 = mock::MakeMockFile({blob1, blob2, blob3}); + AddMockFile(file1); + + const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex), + BlobStr(199, 3 << 10, 1 << 20)); + const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex), + BlobStr(19, 6789, 333)); + const stl_wrappers::KVMap::value_type blob6( + KeyStr("f", 6U, kTypeBlobIndex), + BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL)); + auto file2 = mock::MakeMockFile({blob4, blob5, blob6}); + AddMockFile(file2); + + const stl_wrappers::KVMap::value_type expected_blob1( + KeyStr("a", 0U, kTypeBlobIndex), blob1.second); + const stl_wrappers::KVMap::value_type expected_blob2( + KeyStr("b", 0U, kTypeBlobIndex), blob2.second); + const stl_wrappers::KVMap::value_type expected_blob3( + KeyStr("c", 0U, kTypeBlobIndex), blob3.second); + const stl_wrappers::KVMap::value_type expected_blob4( + KeyStr("d", 0U, kTypeBlobIndex), blob4.second); + const stl_wrappers::KVMap::value_type expected_blob5( + KeyStr("e", 0U, kTypeBlobIndex), blob5.second); + const stl_wrappers::KVMap::value_type expected_blob6( + KeyStr("f", 0U, kTypeBlobIndex), blob6.second); + auto expected_results = + mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3, + expected_blob4, expected_blob5, expected_blob6}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, + std::vector(), kMaxSequenceNumber, + /* output_level */ 1, /* verify */ true, + /* expected_oldest_blob_file_numbers */ {19}); +} + +TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { + cf_options_.bottommost_temperature = Temperature::kCold; + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = true; + }); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + NewDB(); + + // Add files on different levels that may overlap + auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}}); + AddMockFile(file0_1); + + auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"}, + {KeyStr("f", 11U, kTypeValue), "val"}}); + AddMockFile(file1_1, 1); + auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"}, + {KeyStr("k", 13U, kTypeValue), "val"}}); + AddMockFile(file1_2, 1); + auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"}, + {KeyStr("u", 15U, kTypeValue), "val"}}); + AddMockFile(file1_3, 1); + + auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"}, + {KeyStr("h", 9U, kTypeValue), "val"}}); + AddMockFile(file2_1, 2); + auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"}, + {KeyStr("p", 7U, kTypeValue), "val"}}); + AddMockFile(file2_2, 2); + + auto file3_1 = 
mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"}, + {KeyStr("k", 3U, kTypeValue), "val"}}); + AddMockFile(file3_1, 3); + auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"}, + {KeyStr("x", 5U, kTypeValue), "val"}}); + AddMockFile(file3_2, 3); + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + const std::vector input_levels = {0, 1, 2, 3}; + auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]); + auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]); + auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]); + auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]); + + RunLastLevelCompaction( + {files0, files1, files2, files3}, input_levels, + /*verify_func=*/[&](Compaction& comp) { + for (char c = 'a'; c <= 'z'; c++) { + std::string c_str; + c_str = c; + const Slice key(c_str); + if (c == 'a') { + ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key)); + } else { + ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key)); + } + } + }); +} + +TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) { + db_options_.enforce_single_del_contracts = false; + NewDB(); + + auto file = + mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""}, + {KeyStr("a", 3U, kTypeDeletion), "dontcare"}}); + AddMockFile(file); + SetLastSequence(4U); + + auto expected_results = mock::MakeMockFile(); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString( + rnd.Uniform(kStrMaxLen - 1) + + 1)); // input file name should have at least one character + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + ASSERT_OK(input.Write(&output)); + 
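+  // As the manipulations below rely on, the serialized form is a fixed32
+  // version prefix followed by "name=value;" tokens (e.g. "output_level=4;"),
+  // so an unknown field can be injected by simply appending to the string.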
output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + status_list.at(rnd.Uniform(static_cast(status_list.size()))); + while (!rnd.OneIn(10)) { + UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)}; + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unique id mismatch + if (!result.output_files.empty()) { + CompactionServiceResult deserialized_tmp; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp)); + deserialized_tmp.output_files[0].unique_id[0] += 1; + ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, 
&mismatch)); + ASSERT_EQ(mismatch, "output_files.unique_id"); + deserialized_tmp.status.PermitUncheckedError(); + } + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobDynamicFileSizeTest + : public CompactionJobTestBase, + public ::testing::WithParamInterface { + public: + CompactionJobDynamicFileSizeTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_dynamic_file_size_test"), + BytewiseComparator(), [](uint64_t /*ts*/) { return ""; }, + /*test_io_priority=*/false, TableTypeForTest::kMockTable) {} +}; + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 21; + + auto file1 = mock::MakeMockFile({ + {KeyStr("c", 5U, kTypeValue), "val2"}, + {KeyStr("n", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"}, + {KeyStr("j", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // Create three L2 files, each size 10. + // max_compaction_bytes 21 means the compaction output in L1 will + // be cut to at least two files. 
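+  // Rough arithmetic: each L1 output charges the ~10-byte L2 files it
+  // overlaps against the 21-byte max_compaction_bytes budget, so one output
+  // spanning all three L2 files would exceed it; hence the [c, h, j] / [n]
+  // split in the expected results.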
+  auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+                                   {KeyStr("c", 1U, kTypeValue), "val"},
+                                   {KeyStr("c1", 1U, kTypeValue), "val"},
+                                   {KeyStr("c2", 1U, kTypeValue), "val"},
+                                   {KeyStr("c3", 1U, kTypeValue), "val"},
+                                   {KeyStr("c4", 1U, kTypeValue), "val"},
+                                   {KeyStr("d", 1U, kTypeValue), "val"},
+                                   {KeyStr("e", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+                                   {KeyStr("i", 1U, kTypeValue), "val"},
+                                   {KeyStr("i1", 1U, kTypeValue), "val"},
+                                   {KeyStr("i2", 1U, kTypeValue), "val"},
+                                   {KeyStr("i3", 1U, kTypeValue), "val"},
+                                   {KeyStr("i4", 1U, kTypeValue), "val"},
+                                   {KeyStr("j", 1U, kTypeValue), "val"},
+                                   {KeyStr("k", 2U, kTypeValue), "val"}});
+  AddMockFile(file4, 2);
+
+  auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"},
+                                   {KeyStr("m", 1U, kTypeValue), "val"},
+                                   {KeyStr("m1", 1U, kTypeValue), "val"},
+                                   {KeyStr("m2", 1U, kTypeValue), "val"},
+                                   {KeyStr("m3", 1U, kTypeValue), "val"},
+                                   {KeyStr("m4", 1U, kTypeValue), "val"},
+                                   {KeyStr("n", 1U, kTypeValue), "val"},
+                                   {KeyStr("o", 2U, kTypeValue), "val"}});
+  AddMockFile(file5, 2);
+
+  // The expected output should be:
+  // L1: [c, h, j] [n]
+  // L2: [b ... e] [h ... k] [l ... o]
+  // It's better to have "j" in the first file because it overlaps with the
+  // second file on L2 anyway.
+  // (Note: before this PR, the output was cut at "h" because the internal
+  // comparator thinks L1 "h" with seqno 3 is smaller than L2 "h" with
+  // seqno 1; from the compaction picker's point of view, though, the two
+  // actually overlap.)
+
+  auto expected_file1 =
+      mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"},
+                          {KeyStr("h", 3U, kTypeValue), "val"},
+                          {KeyStr("j", 4U, kTypeValue), "val"}});
+  auto expected_file2 =
+      mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}});
+
+  SetLastSequence(6U);
+
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+  RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                {expected_file1, expected_file2});
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) {
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+  NewDB();
+  // Make sure the grandparent level file size (10) qualifies for skipping.
+  // Currently, it has to be > 1/8 of target file size.
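+  // With target_file_size_base = 70, the skip threshold is 70 / 8 = 8.75
+  // bytes, so the 10-byte grandparent files below qualify.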
+ mutable_cf_options_.target_file_size_base = 70; + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("z", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"}, + {KeyStr("d", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("i", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"}, + {KeyStr("y", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto expected_file1 = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + auto expected_file2 = + mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + auto expected_file_disable_dynamic_file_size = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. 
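+  // Hence entries are inflated to 10000 bytes each below, keeping the output
+  // files far above that 100-byte floor so boundary-aligned cutting is
+  // exercised.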
+ const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + char ch = 'd'; + // Add value from d -> o + for (char i = 0; i < 12; i++) { + file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"}, + {KeyStr("s", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // the 1st grandparent file should be skipped + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"}, + {KeyStr("e", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("j", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"}, + {KeyStr("n", 2U, kTypeValue), "val"}}); + AddMockFile(file6, 2); + + auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"}, + {KeyStr("t", 2U, kTypeValue), "val"}}); + AddMockFile(file7, 2); + + // The expected outputs are: + // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s] + // L2: [a, b] [c, e] [h, j] [k, n] [q, t] + // The first output cut earlier at "j", so it could be aligned with L2 files. + // If dynamic_file_size is not enabled, it will be cut based on the + // target_file_size + mock::KVVector expected_file1; + for (char i = 0; i < 7; i++) { + expected_file1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file2; + for (char i = 7; i < 12; i++) { + expected_file2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val"); + + mock::KVVector expected_file_disable_dynamic_file_size1; + for (char i = 0; i < 10; i++) { + expected_file_disable_dynamic_file_size1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file_disable_dynamic_file_size2; + for (char i = 10; i < 12; i++) { + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr("s", 4U, kTypeValue), "val"); + + SetLastSequence(22U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size1, + expected_file_disable_dynamic_file_size2}); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. 
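+  // The same-key twist: all eight versions of user key "a" are expected to
+  // end up in a single output file even when cutting for alignment, since an
+  // output file is not split in the middle of a user key.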
+ const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + for (int i = 0; i < 7; i++) { + file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + } + file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb"); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"}, + {KeyStr("b", 90U, kTypeValue), "valb"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"}, + {KeyStr("a", 88U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"}, + {KeyStr("a", 86U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"}, + {KeyStr("b", 84U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + mock::KVVector expected_file1; + mock::KVVector expected_file_disable_dynamic_file_size; + + for (int i = 0; i < 8; i++) { + expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i)); + } + + // make sure `b` is cut in a separated file (so internally it's not using + // internal comparator, which will think the "b:90" (seqno 90) here is smaller + // than "b:85" on L2.) + auto expected_file2 = + mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}}); + + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("b", 90U, kTypeValue), "valb"); + + SetLastSequence(122U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector snapshots; + for (int i = 80; i <= 100; i++) { + snapshots.emplace_back(i); + } + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}, snapshots); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}, snapshots); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) { + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. 
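+  // Accordingly, unlike the tests above, a single RunCompaction call below
+  // checks the same three expected output files for both parameter values.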
+ bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 20; + + auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("b", 103U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"}, + {KeyStr("c", 101U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"}, + {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"}, + {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"}, + {KeyStr("c", 59U, kTypeValue), "valc"}}); + + // "a" has 10 overlapped grandparent files (each size 10), which is far + // exceeded the `max_compaction_bytes`, but make sure 2 "a" are not separated, + // as splitting them won't help reducing the compaction size. + // also make sure "b" and "c" are cut separately. + mock::KVVector expected_file1 = + mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("a", 102U, kTypeValue), "val2"}}); + mock::KVVector expected_file2 = + mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}}); + mock::KVVector expected_file3 = + mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}}); + + SetLastSequence(122U); + const std::vector input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector snapshots; + for (int i = 80; i <= 105; i++) { + snapshots.emplace_back(i); + } + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2, expected_file3}, snapshots); +} + +INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest, + CompactionJobDynamicFileSizeTest, testing::Bool()); + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::BytewiseComparatorWithU64TsWrapper(), + test::EncodeInt, /*test_io_priority=*/false, + TableTypeForTest::kMockTable) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 
95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""}, + {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"}, + {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"}, + {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}}); + AddMockFile(file2); + + SetLastSequence(7); + + auto expected_results = + mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits::max()); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, {input_level}, {expected_results}); +} + +class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase { + public: + // Block-based table is needed if we want to test subcompaction partitioning + // with anchors. 
+ explicit CompactionJobTimestampTestWithBbTable() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_ts_bbt_test"), + test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt, + /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {} +}; + +TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) { + cf_options_.target_file_size_base = 20; + mutable_cf_options_.target_file_size_base = 20; + NewDB(); + + const std::vector keys = { + KeyStr("a", 20, ValueType::kTypeValue, 200), + KeyStr("b", 21, ValueType::kTypeValue, 210), + KeyStr("b", 20, ValueType::kTypeValue, 200), + KeyStr("b", 18, ValueType::kTypeValue, 180), + KeyStr("c", 17, ValueType::kTypeValue, 170), + KeyStr("c", 16, ValueType::kTypeValue, 160), + KeyStr("c", 15, ValueType::kTypeValue, 150)}; + const std::vector values = {"a20", "b21", "b20", "b18", + "c17", "c16", "c15"}; + + constexpr int input_level = 1; + + auto file1 = mock::MakeMockFile( + {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}}); + AddMockFile(file1, input_level); + + auto file2 = mock::MakeMockFile( + {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}}); + AddMockFile(file2, input_level); + + auto file3 = mock::MakeMockFile({{keys[6], values[6]}}); + AddMockFile(file3, input_level); + + SetLastSequence(20); + + auto output1 = mock::MakeMockFile({{keys[0], values[0]}}); + auto output2 = mock::MakeMockFile( + {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}}); + auto output3 = mock::MakeMockFile( + {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}}); + + auto expected_results = + std::vector{output1, output2, output3}; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + constexpr int output_level = 2; + constexpr int max_subcompactions = 4; + RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{}, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + output_level, /*verify=*/true, {kInvalidBlobFileNumber}, + /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL, + max_subcompactions); +} + +TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) { + cf_options_.target_file_size_base = 20; + mutable_cf_options_.target_file_size_base = 20; + NewDB(); + + const std::vector keys = { + KeyStr("a", 20, ValueType::kTypeValue, 200), + KeyStr("b", 20, ValueType::kTypeValue, 200), + KeyStr("b", 19, ValueType::kTypeValue, 190), + KeyStr("b", 18, ValueType::kTypeValue, 180), + KeyStr("c", 17, ValueType::kTypeValue, 170), + KeyStr("c", 16, ValueType::kTypeValue, 160), + KeyStr("c", 15, ValueType::kTypeValue, 150)}; + const std::vector values = {"a20", "b20", "b19", "b18", + "c17", "c16", "c15"}; + + constexpr int input_level = 0; + + auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}}); + AddMockFile(file1, input_level); + + auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}}); + AddMockFile(file2, input_level); + + auto file3 = mock::MakeMockFile( + {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}}); + AddMockFile(file3, input_level); + + SetLastSequence(20); + + auto output1 = mock::MakeMockFile({{keys[0], values[0]}}); + auto output2 = mock::MakeMockFile( + {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}}); + auto output3 = mock::MakeMockFile( + {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}}); + + auto expected_results = + std::vector{output1, output2, output3}; + const auto& files = 
cfd_->current()->storage_info()->LevelFiles(input_level); + + constexpr int output_level = 1; + constexpr int max_subcompactions = 4; + RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{}, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + output_level, /*verify=*/true, {kInvalidBlobFileNumber}, + /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL, + max_subcompactions); +} + +// The io priority of the compaction reads and writes are different from +// other DB reads and writes. To prepare the compaction input files, use the +// default filesystem from Env. To test the io priority of the compaction +// reads and writes, db_options_.fs is set as MockTestFileSystem. +class CompactionJobIOPriorityTest : public CompactionJobTestBase { + public: + CompactionJobIOPriorityTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_io_priority_test"), + BytewiseComparator(), [](uint64_t /*ts*/) { return ""; }, + /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {} +}; + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) { + // When the state from WriteController is normal. + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_LOW, Env::IO_LOW); +} + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) { + // When the state from WriteController is Delayed. + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + { + std::unique_ptr delay_token = + write_controller_.GetDelayToken(1000000); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_USER, Env::IO_USER); + } +} + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) { + // When the state from WriteController is Stalled. 
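+  // A stop token from the WriteController should raise the compaction I/O
+  // priority from the default Env::IO_LOW to Env::IO_USER, matching the
+  // Delayed case above.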
+ NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + { + std::unique_ptr stop_token = + write_controller_.GetStopToken(); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_USER, Env::IO_USER); + } +} + +TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) { + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true, + Env::IO_LOW, Env::IO_LOW); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc new file mode 100644 index 000000000..e74378e2a --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_outputs.cc @@ -0,0 +1,646 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/compaction_outputs.h" + +#include "db/builder.h" + +namespace ROCKSDB_NAMESPACE { + +void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { + builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); +} + +Status CompactionOutputs::Finish(const Status& intput_status, + const SeqnoToTimeMapping& seqno_time_mapping) { + FileMetaData* meta = GetMetaData(); + assert(meta != nullptr); + Status s = intput_status; + if (s.ok()) { + std::string seqno_time_mapping_str; + seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str, + meta->oldest_ancester_time); + s = builder_->Finish(); + + } else { + builder_->Abandon(); + } + Status io_s = builder_->io_status(); + if (s.ok()) { + s = io_s; + } else { + io_s.PermitUncheckedError(); + } + const uint64_t current_bytes = builder_->FileSize(); + if (s.ok()) { + meta->fd.file_size = current_bytes; + meta->marked_for_compaction = builder_->NeedCompact(); + } + current_output().finished = true; + stats_.bytes_written += current_bytes; + stats_.num_output_files = outputs_.size(); + + return s; +} + +IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, + SystemClock* clock, + Statistics* statistics, + bool use_fsync) { + IOStatus io_s; + if (input_status.ok()) { + StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = file_writer_->Sync(use_fsync); + } + if (input_status.ok() && io_s.ok()) { + io_s = file_writer_->Close(); + } + + if (input_status.ok() && io_s.ok()) { + FileMetaData* meta = GetMetaData(); + meta->file_checksum = file_writer_->GetFileChecksum(); + meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName(); + } + + file_writer_.reset(); + + return io_s; +} + +size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( + const Slice& internal_key) { + size_t curr_key_boundary_switched_num = 0; + const std::vector& grandparents = compaction_->grandparents(); + + if (grandparents.empty()) { + return curr_key_boundary_switched_num; + } + assert(!internal_key.empty()); + InternalKey ikey; + ikey.DecodeFrom(internal_key); + assert(ikey.Valid()); + + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + + // Move the grandparent_index_ to the file containing the current user_key. + // If there are multiple files containing the same user_key, make sure the + // index points to the last file containing the key. + while (grandparent_index_ < grandparents.size()) { + if (being_grandparent_gap_) { + if (sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) < 0) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_overlapped_bytes_ += + grandparents[grandparent_index_]->fd.GetFileSize(); + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = false; + } else { + int cmp_result = sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_]->largest); + // If it's same key, make sure grandparent_index_ is pointing to the last + // one. 
+ if (cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_ + 1]->smallest) < + 0))) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = true; + grandparent_index_++; + } + } + + // If the first key is in the middle of a grandparent file, adding it to the + // overlap + if (!seen_key_ && !being_grandparent_gap_) { + assert(grandparent_overlapped_bytes_ == 0); + grandparent_overlapped_bytes_ = + GetCurrentKeyGrandparentOverlappedBytes(internal_key); + } + + seen_key_ = true; + return curr_key_boundary_switched_num; +} + +uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const { + // no overlap with any grandparent file + if (being_grandparent_gap_) { + return 0; + } + uint64_t overlapped_bytes = 0; + + const std::vector& grandparents = compaction_->grandparents(); + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + InternalKey ikey; + ikey.DecodeFrom(internal_key); +#ifndef NDEBUG + // make sure the grandparent_index_ is pointing to the last files containing + // the current key. + int cmp_result = + sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest); + assert( + cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0))); + assert(sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) >= 0); +#endif + overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize(); + + // go backwards to find all overlapped files, one key can overlap multiple + // files. In the following example, if the current output key is `c`, and one + // compaction file was cut before `c`, current `c` can overlap with 3 files: + // [a b] [c... + // [b, b] [c, c] [c, c] [c, d] + for (int64_t i = static_cast(grandparent_index_) - 1; + i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0; + i--) { + overlapped_bytes += grandparents[i]->fd.GetFileSize(); + } + + return overlapped_bytes; +} + +bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { + assert(c_iter.Valid()); + + // always update grandparent information like overlapped file number, size + // etc. 
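+  // This must happen for every key, even before any output file has been
+  // opened, so the running overlap statistics stay in sync with the iterator
+  // position.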
+ const Slice& internal_key = c_iter.key(); + const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_; + size_t num_grandparent_boundaries_crossed = + UpdateGrandparentBoundaryInfo(internal_key); + + if (!HasBuilder()) { + return false; + } + + // If there's user defined partitioner, check that first + if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest( + last_key_for_partitioner_, c_iter.user_key(), + current_output_file_size_)) == kRequired) { + return true; + } + + // files output to Level 0 won't be split + if (compaction_->output_level() == 0) { + return false; + } + + // reach the max file size + if (current_output_file_size_ >= compaction_->max_output_file_size()) { + return true; + } + + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + + // Check if it needs to split for RoundRobin + // Invalid local_output_split_key indicates that we do not need to split + if (local_output_split_key_ != nullptr && !is_split_) { + // Split occurs when the next key is larger than/equal to the cursor + if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) { + is_split_ = true; + return true; + } + } + + // only check if the current key is going to cross the grandparents file + // boundary (either the file beginning or ending). + if (num_grandparent_boundaries_crossed > 0) { + // Cut the file before the current key if the size of the current output + // file + its overlapped grandparent files is bigger than + // max_compaction_bytes. Which is to prevent future bigger than + // max_compaction_bytes compaction from the current output level. + if (grandparent_overlapped_bytes_ + current_output_file_size_ > + compaction_->max_compaction_bytes()) { + return true; + } + + // Cut the file if including the key is going to add a skippable file on + // the grandparent level AND its size is reasonably big (1/8 of target file + // size). For example, if it's compacting the files L0 + L1: + // L0: [1, 21] + // L1: [3, 23] + // L2: [2, 4] [11, 15] [22, 24] + // Without this break, it will output as: + // L1: [1,3, 21,23] + // With this break, it will output as (assuming [11, 15] at L2 is bigger + // than 1/8 of target size): + // L1: [1,3] [21,23] + // Then for the future compactions, [11,15] won't be included. + // For random datasets (either evenly distributed or skewed), it rarely + // triggers this condition, but if the user is adding 2 different datasets + // without any overlap, it may likely happen. + // More details, check PR #1963 + const size_t num_skippable_boundaries_crossed = + being_grandparent_gap_ ? 2 : 3; + if (compaction_->immutable_options()->compaction_style == + kCompactionStyleLevel && + compaction_->immutable_options()->level_compaction_dynamic_file_size && + num_grandparent_boundaries_crossed >= + num_skippable_boundaries_crossed && + grandparent_overlapped_bytes_ - previous_overlapped_bytes > + compaction_->target_output_file_size() / 8) { + return true; + } + + // Pre-cut the output file if it's reaching a certain size AND it's at the + // boundary of a grandparent file. It can reduce the future compaction size, + // the cost is having smaller files. + // The pre-cut size threshold is based on how many grandparent boundaries + // it has seen before. Basically, if it has seen no boundary at all, then it + // will pre-cut at 50% target file size. Every boundary it has seen + // increases the threshold by 5%, max at 90%, which it will always cut. 
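+    // For example, with a 64MB target file size and 4 boundaries seen so
+    // far, the threshold is ((64MB + 99) / 100) * (50 + 4 * 5), i.e. the
+    // file is pre-cut once it reaches ~70% of the target size.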
+    // The idea is that if it has seen more boundaries before, it is more
+    // likely to see another boundary (file cutting opportunity) before the
+    // target file size. The test shows it can generate larger files than a
+    // static threshold like 75% and has a similar write amplification
+    // improvement.
+    if (compaction_->immutable_options()->compaction_style ==
+            kCompactionStyleLevel &&
+        compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+        current_output_file_size_ >=
+            ((compaction_->target_output_file_size() + 99) / 100) *
+                (50 + std::min(grandparent_boundary_switched_num_ * 5,
+                               size_t{40}))) {
+      return true;
+    }
+  }
+
+  // check ttl file boundaries if there are any
+  if (!files_to_cut_for_ttl_.empty()) {
+    if (cur_files_to_cut_for_ttl_ != -1) {
+      // Previous key is inside the range of a file
+      if (icmp->Compare(internal_key,
+                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+                            ->largest.Encode()) > 0) {
+        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+        cur_files_to_cut_for_ttl_ = -1;
+        return true;
+      }
+    } else {
+      // Look for the key position
+      while (next_files_to_cut_for_ttl_ <
+             static_cast<int>(files_to_cut_for_ttl_.size())) {
+        if (icmp->Compare(internal_key,
+                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                              ->smallest.Encode()) >= 0) {
+          if (icmp->Compare(internal_key,
+                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                                ->largest.Encode()) <= 0) {
+            // Within the current file
+            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+            return true;
+          }
+          // Beyond the current file
+          next_files_to_cut_for_ttl_++;
+        } else {
+          // Still falls into the gap
+          break;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+Status CompactionOutputs::AddToOutput(
+    const CompactionIterator& c_iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  Status s;
+  const Slice& key = c_iter.key();
+
+  if (ShouldStopBefore(c_iter) && HasBuilder()) {
+    s = close_file_func(*this, c_iter.InputStatus(), key);
+    if (!s.ok()) {
+      return s;
+    }
+    // reset grandparent information
+    grandparent_boundary_switched_num_ = 0;
+    grandparent_overlapped_bytes_ =
+        GetCurrentKeyGrandparentOverlappedBytes(key);
+  }
+
+  // Open output file if necessary
+  if (!HasBuilder()) {
+    s = open_file_func(*this);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(builder_ != nullptr);
+  const Slice& value = c_iter.value();
+  s = current_output().validator.Add(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  builder_->Add(key, value);
+
+  stats_.num_output_records++;
+  current_output_file_size_ = builder_->EstimatedFileSize();
+
+  if (blob_garbage_meter_) {
+    s = blob_garbage_meter_->ProcessOutFlow(key, value);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const ParsedInternalKey& ikey = c_iter.ikey();
+  s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+                                             ikey.type);
+
+  if (partitioner_) {
+    last_key_for_partitioner_.assign(c_iter.user_key().data_,
+                                     c_iter.user_key().size_);
+  }
+
+  return s;
+}
+
+Status CompactionOutputs::AddRangeDels(
+    const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+    CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+    const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+    const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+  assert(HasRangeDel());
+  FileMetaData& meta = current_output().meta;
+  const Comparator* ucmp = icmp.user_comparator();
+
+  Slice lower_bound_guard, upper_bound_guard;
+  std::string smallest_user_key;
+  const
Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; + + size_t output_size = outputs_.size(); + if (output_size == 1) { + // For the first output table, include range tombstones before the min + // key but after the subcompaction boundary. + lower_bound = comp_start_user_key; + lower_bound_from_sub_compact = true; + } else if (meta.smallest.size() > 0) { + // For subsequent output tables, only include range tombstones from min + // key onwards since the previous file was extended to contain range + // tombstones falling before min key. + smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + if (!next_table_min_key.empty()) { + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. + upper_bound_guard = ExtractUserKey(next_table_min_key); + if (comp_end_user_key != nullptr && + ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >= + 0) { + upper_bound = comp_end_user_key; + } else { + upper_bound = &upper_bound_guard; + } + } else { + // This is the last file in the subcompaction, so extend until the + // subcompaction ends. + upper_bound = comp_end_user_key; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta.largest.size() > 0) { + has_overlapping_endpoints = ucmp->CompareWithoutTimestamp( + meta.largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. + assert(comp_end_user_key == nullptr || upper_bound == nullptr || + ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0); + auto it = range_del_agg_->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. + if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } + for (; it->Valid(); it->Next()) { + auto tombstone = it->Tombstone(); + if (upper_bound != nullptr) { + int cmp = + ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_); + if ((has_overlapping_endpoints && cmp < 0) || + (!has_overlapping_endpoints && cmp <= 0)) { + // Tombstones starting after upper_bound only need to be included in + // the next table. If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range + // tombstones will be included in the next file and are not relevant + // to the point keys or endpoints of the current file. + break; + } + } + + const size_t ts_sz = ucmp->timestamp_size(); + // Garbage collection for range tombstones. 
+    // If user-defined timestamp is enabled, range tombstones are dropped if
+    // they are at bottommost_level, below full_history_ts_low and not visible
+    // in any snapshot. trim_ts_ is passed to the constructor for
+    // range_del_agg_, and range_del_agg_ internally drops tombstones above
+    // trim_ts_.
+    if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+        (ts_sz == 0 ||
+         (!full_history_ts_low.empty() &&
+          ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+      // TODO(andrewkr): tombstones that span multiple output files are
+      // counted for each compaction output file, so lots of double
+      // counting.
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
+
+    auto kv = tombstone.Serialize();
+    assert(lower_bound == nullptr ||
+           ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+    // Range tombstone is not supported by output validator yet.
+    builder_->Add(kv.first.Encode(), kv.second);
+    InternalKey smallest_candidate = std::move(kv.first);
+    if (lower_bound != nullptr &&
+        ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
+                                      *lower_bound) <= 0) {
+      // Pretend the smallest key has the same user key as lower_bound
+      // (the max key in the previous table or subcompaction) in order for
+      // files to appear key-space partitioned.
+      //
+      // When lower_bound is chosen by a subcompaction, we know that
+      // subcompactions over smaller keys cannot contain any keys at
+      // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction would be unbounded on the left.
+      // As a result, we know that no other files on the output level will
+      // contain actual keys at lower_bound (an output file may have a
+      // largest key of lower_bound@kMaxSequenceNumber, but this only
+      // indicates a large range tombstone was truncated). Therefore, it is
+      // safe to use the tombstone's sequence number, to ensure that keys at
+      // lower_bound at lower levels are covered by truncated tombstones.
+      //
+      // If lower_bound was chosen by the smallest data key in the file,
+      // choose the lowest seqnum so this file's smallest internal key comes
+      // after the previous file's largest. The fake seqnum is OK because
+      // the read path's file-picking code only considers the user key.
+      if (lower_bound_from_sub_compact) {
+        if (ts_sz) {
+          assert(tombstone.ts_.size() == ts_sz);
+          smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
+                                           kTypeRangeDeletion, tombstone.ts_);
+        } else {
+          smallest_candidate =
+              InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
+        }
+      } else {
+        smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+      }
+    }
+    InternalKey largest_candidate = tombstone.SerializeEndKey();
+    if (upper_bound != nullptr &&
+        ucmp->CompareWithoutTimestamp(*upper_bound,
+                                      largest_candidate.user_key()) <= 0) {
+      // Pretend the largest key has the same user key as upper_bound (the
+      // min key in the following table or subcompaction) in order for files
+      // to appear key-space partitioned.
+      //
+      // Choose the highest seqnum so this file's largest internal key comes
+      // before the next file's/subcompaction's smallest. The fake seqnum is
+      // OK because the read path's file-picking code only considers the
+      // user key portion.
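+      // For example, a tombstone [c, f)@5 written into a file whose
+      // upper_bound is d is recorded with largest key
+      // d@kMaxSequenceNumber:kTypeRangeDeletion; the rest of the tombstone is
+      // picked up again by the next output file.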
+ // + // Note Seek() also creates InternalKey with (user_key, + // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of + // kTypeRangeDeletion (0xF), so the range tombstone comes before the + // Seek() key in InternalKey's ordering. So Seek() will look in the + // next file for the user key + if (ts_sz) { + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_sz <= strlen(kTsMax)) { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, + Slice(kTsMax, ts_sz)); + } else { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, + std::string(ts_sz, '\xff')); + } + } else { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); + } + } +#ifndef NDEBUG + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta.smallest.size() > 0) { + smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode()); + } +#endif + meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate, + tombstone.seq_, icmp); + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels. + assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta.smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); + } + return Status::OK(); +} + +void CompactionOutputs::FillFilesToCutForTtl() { + if (compaction_->immutable_options()->compaction_style != + kCompactionStyleLevel || + compaction_->immutable_options()->compaction_pri != + kMinOverlappingRatio || + compaction_->mutable_cf_options()->ttl == 0 || + compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) { + return; + } + + // We define new file with the oldest ancestor time to be younger than 1/4 + // TTL, and an old one to be older than 1/2 TTL time. + int64_t temp_current_time; + auto get_time_status = + compaction_->immutable_options()->clock->GetCurrentTime( + &temp_current_time); + if (!get_time_status.ok()) { + return; + } + + auto current_time = static_cast(temp_current_time); + if (current_time < compaction_->mutable_cf_options()->ttl) { + return; + } + + uint64_t old_age_thres = + current_time - compaction_->mutable_cf_options()->ttl / 2; + const std::vector& olevel = + *(compaction_->inputs(compaction_->num_input_levels() - 1)); + for (FileMetaData* file : olevel) { + // Worth filtering out by start and end? + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + // We put old files if they are not too small to prevent a flood + // of small files. + if (oldest_ancester_time < old_age_thres && + file->fd.GetFileSize() > + compaction_->mutable_cf_options()->target_file_size_base / 2) { + files_to_cut_for_ttl_.push_back(file); + } + } +} + +CompactionOutputs::CompactionOutputs(const Compaction* compaction, + const bool is_penultimate_level) + : compaction_(compaction), is_penultimate_level_(is_penultimate_level) { + partitioner_ = compaction->output_level() == 0 + ? 
nullptr + : compaction->CreateSstPartitioner(); + + if (compaction->output_level() != 0) { + FillFilesToCutForTtl(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h new file mode 100644 index 000000000..f40aa8215 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_outputs.h @@ -0,0 +1,385 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/blob/blob_garbage_meter.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iterator.h" +#include "db/internal_stats.h" +#include "db/output_validator.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactionOutputs; +using CompactionFileOpenFunc = std::function; +using CompactionFileCloseFunc = + std::function; + +// Files produced by subcompaction, most of the functions are used by +// compaction_job Open/Close compaction file functions. +class CompactionOutputs { + public: + // compaction output file + struct Output { + Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, + bool _enable_order_check, bool _enable_hash, bool _finished, + uint64_t precalculated_hash) + : meta(std::move(_meta)), + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} + FileMetaData meta; + OutputValidator validator; + bool finished; + std::shared_ptr table_properties; + }; + + CompactionOutputs() = delete; + + explicit CompactionOutputs(const Compaction* compaction, + const bool is_penultimate_level); + + // Add generated output to the list + void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp, + bool enable_order_check, bool enable_hash, + bool finished = false, uint64_t precalculated_hash = 0) { + outputs_.emplace_back(std::move(meta), icmp, enable_order_check, + enable_hash, finished, precalculated_hash); + } + + // Set new table builder for the current output + void NewBuilder(const TableBuilderOptions& tboptions); + + // Assign a new WritableFileWriter to the current output + void AssignFileWriter(WritableFileWriter* writer) { + file_writer_.reset(writer); + } + + // TODO: Remove it when remote compaction support tiered compaction + void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; } + void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; } + + // TODO: Move the BlobDB builder into CompactionOutputs + const std::vector& GetBlobFileAdditions() const { + if (is_penultimate_level_) { + assert(blob_file_additions_.empty()); + } + return blob_file_additions_; + } + + std::vector* GetBlobFileAdditionsPtr() { + assert(!is_penultimate_level_); + return &blob_file_additions_; + } + + bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); } + + BlobGarbageMeter* CreateBlobGarbageMeter() { + assert(!is_penultimate_level_); + blob_garbage_meter_ = std::make_unique(); + return blob_garbage_meter_.get(); + } + + BlobGarbageMeter* GetBlobGarbageMeter() const { + if (is_penultimate_level_) { + // blobdb doesn't support per_key_placement yet + assert(blob_garbage_meter_ == 
nullptr);
+      return nullptr;
+    }
+    return blob_garbage_meter_.get();
+  }
+
+  void UpdateBlobStats() {
+    assert(!is_penultimate_level_);
+    stats_.num_output_files_blob = blob_file_additions_.size();
+    for (const auto& blob : blob_file_additions_) {
+      stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+    }
+  }
+
+  // Finish the current output file
+  Status Finish(const Status& input_status,
+                const SeqnoToTimeMapping& seqno_time_mapping);
+
+  // Update output table properties from table builder
+  void UpdateTableProperties() {
+    current_output().table_properties =
+        std::make_shared<TableProperties>(GetTableProperties());
+  }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+                           Statistics* statistics, bool use_fsync);
+
+  TableProperties GetTableProperties() {
+    return builder_->GetTableProperties();
+  }
+
+  Slice SmallestUserKey() const {
+    if (!outputs_.empty() && outputs_[0].finished) {
+      return outputs_[0].meta.smallest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  Slice LargestUserKey() const {
+    if (!outputs_.empty() && outputs_.back().finished) {
+      return outputs_.back().meta.largest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  // Remove the last output file if it is empty; such a file does not need to
+  // be kept.
+  void RemoveLastEmptyOutput() {
+    if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+      // An error occurred, so ignore the last output.
+      outputs_.pop_back();
+    }
+  }
+
+  // Remove the last output unconditionally, e.g. when it holds no data (no
+  // entries and no range-dels); its file_size might still be non-zero because
+  // of SST metadata.
+  void RemoveLastOutput() {
+    assert(!outputs_.empty());
+    outputs_.pop_back();
+  }
+
+  bool HasBuilder() const { return builder_ != nullptr; }
+
+  FileMetaData* GetMetaData() { return &current_output().meta; }
+
+  bool HasOutput() const { return !outputs_.empty(); }
+
+  uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+  void ResetBuilder() {
+    builder_.reset();
+    current_output_file_size_ = 0;
+  }
+
+  // Add range-dels from the aggregator to the current output file
+  // @param comp_start_user_key and comp_end_user_key include timestamp if
+  // user-defined timestamp is enabled.
+  // @param full_history_ts_low used for range tombstone garbage collection.
+  Status AddRangeDels(const Slice* comp_start_user_key,
+                      const Slice* comp_end_user_key,
+                      CompactionIterationStats& range_del_out_stats,
+                      bool bottommost_level, const InternalKeyComparator& icmp,
+                      SequenceNumber earliest_snapshot,
+                      const Slice& next_table_min_key,
+                      const std::string& full_history_ts_low);
+
+  // If the outputs have range deletes, the range deletes are also data.
+  bool HasRangeDel() const {
+    return range_del_agg_ && !range_del_agg_->IsEmpty();
+  }
+
+ private:
+  friend class SubcompactionState;
+
+  void FillFilesToCutForTtl();
+
+  void SetOutputSlitKey(const std::optional<Slice> start,
+                        const std::optional<Slice> end) {
+    const InternalKeyComparator* icmp =
+        &compaction_->column_family_data()->internal_comparator();
+
+    const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+    // Invalid output_split_key indicates that we do not need to split
+    if (output_split_key != nullptr) {
+      // We may only split the output when the cursor is in the range.
Split + if ((!end.has_value() || + icmp->user_comparator()->Compare( + ExtractUserKey(output_split_key->Encode()), end.value()) < 0) && + (!start.has_value() || icmp->user_comparator()->Compare( + ExtractUserKey(output_split_key->Encode()), + start.value()) > 0)) { + local_output_split_key_ = output_split_key; + } + } + } + + // Returns true iff we should stop building the current output + // before processing the current key in compaction iterator. + bool ShouldStopBefore(const CompactionIterator& c_iter); + + void Cleanup() { + if (builder_ != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + builder_->Abandon(); + builder_.reset(); + } + } + + // update tracked grandparents information like grandparent index, if it's + // in the gap between 2 grandparent files, accumulated grandparent files size + // etc. + // It returns how many boundaries it crosses by including current key. + size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key); + + // helper function to get the overlapped grandparent files size, it's only + // used for calculating the first key's overlap. + uint64_t GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const; + + // Add current key from compaction_iterator to the output file. If needed + // close and open new compaction output with the functions provided. + Status AddToOutput(const CompactionIterator& c_iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + + // Close the current output. `open_file_func` is needed for creating new file + // for range-dels only output file. + Status CloseOutput(const Status& curr_status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + Status status = curr_status; + // handle subcompaction containing only range deletions + if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) { + status = open_file_func(*this); + } + if (HasBuilder()) { + const Slice empty_key{}; + Status s = close_file_func(*this, status, empty_key); + if (!s.ok() && status.ok()) { + status = s; + } + } + + return status; + } + + // This subcompaction's output could be empty if compaction was aborted before + // this subcompaction had a chance to generate any output files. When + // subcompactions are executed sequentially this is more likely and will be + // particularly likely for the later subcompactions to be empty. Once they are + // run in parallel however it should be much rarer. + // It's caller's responsibility to make sure it's not empty. + Output& current_output() { + assert(!outputs_.empty()); + return outputs_.back(); + } + + // Assign the range_del_agg to the target output level. There's only one + // range-del-aggregator per compaction outputs, for + // output_to_penultimate_level compaction it is only assigned to the + // penultimate level. 
+ void AssignRangeDelAggregator( + std::unique_ptr&& range_del_agg) { + assert(range_del_agg_ == nullptr); + range_del_agg_ = std::move(range_del_agg); + } + + const Compaction* compaction_; + + // current output builder and writer + std::unique_ptr builder_; + std::unique_ptr file_writer_; + uint64_t current_output_file_size_ = 0; + + // all the compaction outputs so far + std::vector outputs_; + + // BlobDB info + std::vector blob_file_additions_; + std::unique_ptr blob_garbage_meter_; + + // Basic compaction output stats for this level's outputs + InternalStats::CompactionOutputsStats stats_; + + // indicate if this CompactionOutputs obj for penultimate_level, should always + // be false if per_key_placement feature is not enabled. + const bool is_penultimate_level_; + std::unique_ptr range_del_agg_ = nullptr; + + // partitioner information + std::string last_key_for_partitioner_; + std::unique_ptr partitioner_; + + // A flag determines if this subcompaction has been split by the cursor + bool is_split_ = false; + + // We also maintain the output split key for each subcompaction to avoid + // repetitive comparison in ShouldStopBefore() + const InternalKey* local_output_split_key_ = nullptr; + + // Some identified files with old oldest ancester time and the range should be + // isolated out so that the output file(s) in that range can be merged down + // for TTL and clear the timestamps for the range. + std::vector files_to_cut_for_ttl_; + int cur_files_to_cut_for_ttl_ = -1; + int next_files_to_cut_for_ttl_ = 0; + + // An index that used to speed up ShouldStopBefore(). + size_t grandparent_index_ = 0; + + // if the output key is being grandparent files gap, so: + // key > grandparents[grandparent_index_ - 1].largest && + // key < grandparents[grandparent_index_].smallest + bool being_grandparent_gap_ = true; + + // The number of bytes overlapping between the current output and + // grandparent files used in ShouldStopBefore(). + uint64_t grandparent_overlapped_bytes_ = 0; + + // A flag determines whether the key has been seen in ShouldStopBefore() + bool seen_key_ = false; + + // for the current output file, how many file boundaries has it crossed, + // basically number of files overlapped * 2 + size_t grandparent_boundary_switched_num_ = 0; +}; + +// helper struct to concatenate the last level and penultimate level outputs +// which could be replaced by std::ranges::join_view() in c++20 +struct OutputIterator { + public: + explicit OutputIterator(const std::vector& a, + const std::vector& b) + : a_(a), b_(b) { + within_a = !a_.empty(); + idx_ = 0; + } + + OutputIterator begin() { return *this; } + + OutputIterator end() { return *this; } + + size_t size() { return a_.size() + b_.size(); } + + const CompactionOutputs::Output& operator*() const { + return within_a ? a_[idx_] : b_[idx_]; + } + + OutputIterator& operator++() { + idx_++; + if (within_a && idx_ >= a_.size()) { + within_a = false; + idx_ = 0; + } + assert(within_a || idx_ <= b_.size()); + return *this; + } + + bool operator!=(const OutputIterator& /*rhs*/) const { + return within_a || idx_ < b_.size(); + } + + private: + const std::vector& a_; + const std::vector& b_; + bool within_a; + size_t idx_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc new file mode 100644 index 000000000..abdecca9f --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker.cc @@ -0,0 +1,1234 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           uint64_t max_compaction_bytes,
+                           CompactionInputFiles* comp_inputs,
+                           SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file while there is at least one unflushed
+  // memtable whose seqno range overlaps with that file.
+  TEST_SYNC_POINT("FindIntraL0Compaction");
+  size_t start = 0;
+  for (; start < level_files.size(); start++) {
+    if (level_files[start]->being_compacted) {
+      return false;
+    }
+    // If there is no data in the memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+    if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+      break;
+    }
+  }
+  if (start >= level_files.size()) {
+    return false;
+  }
+  size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+  size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+  // Compaction range will be [start, limit).
+  size_t limit;
+  // Pull in files until the amount of compaction work per deleted file begins
+  // increasing or maximum total compaction size is reached.
+  size_t new_compact_bytes_per_del_file = 0;
+  for (limit = start + 1; limit < level_files.size(); ++limit) {
+    compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+    new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+    if (level_files[limit]->being_compacted ||
+        new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+        compact_bytes > max_compaction_bytes) {
+      break;
+    }
+    compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+  }
+
+  if ((limit - start) >= min_files_to_compact &&
+      compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+    assert(comp_inputs != nullptr);
+    comp_inputs->level = 0;
+    for (size_t i = start; i < limit; ++i) {
+      comp_inputs->files.push_back(level_files[i]);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Determine the compression type based on the user options, the level of the
+// output file, and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
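+// For example, with base_level == 4 and compression_per_level ==
+// {kNoCompression, kSnappyCompression, kZSTD}: L0 uses entry 0
+// (kNoCompression), L4 uses entry 1 (kSnappyCompression), and L5 and beyond
+// clamp to the last entry (kZSTD).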
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + int level, int base_level, + const bool enable_compression) { + if (!enable_compression) { + // disable compression + return kNoCompression; + } + + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use it. + if (mutable_cf_options.bottommost_compression != kDisableCompressionOption && + level >= (vstorage->num_non_empty_levels() - 1)) { + return mutable_cf_options.bottommost_compression; + } + // If the user has specified a different compression level for each level, + // then pick the compression for that level. + if (!mutable_cf_options.compression_per_level.empty()) { + assert(level == 0 || level >= base_level); + int idx = (level == 0) ? 0 : level - base_level + 1; + + const int n = + static_cast(mutable_cf_options.compression_per_level.size()) - 1; + // It is possible for level_ to be -1; in that case, we use level + // 0's compression. This occurs mostly in backwards compatibility + // situations when the builder doesn't know what level the file + // belongs to. Likewise, if level is beyond the end of the + // specified compression levels, use the last value. + return mutable_cf_options + .compression_per_level[std::max(0, std::min(idx, n))]; + } else { + return mutable_cf_options.compression; + } +} + +CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression) { + if (!enable_compression) { + return cf_options.compression_opts; + } + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. + if (level >= (vstorage->num_non_empty_levels() - 1) && + cf_options.bottommost_compression_opts.enabled) { + return cf_options.bottommost_compression_opts; + } + return cf_options.compression_opts; +} + +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : ioptions_(ioptions), icmp_(icmp) {} + +CompactionPicker::~CompactionPicker() {} + +// Delete this compaction from the list of running compactions. 
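+// If the compaction failed, the next-compaction cursor is also reset so the
+// same files can be picked again later.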
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + UnregisterCompaction(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs, + InternalKey* smallest, + InternalKey* largest) const { + const int level = inputs.level; + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + + if (level == 0) { + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } + } else { + *smallest = inputs[0]->smallest; + *largest = inputs[inputs.size() - 1]->largest; + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, + InternalKey* smallest, + InternalKey* largest) const { + assert(!inputs1.empty() || !inputs2.empty()); + if (inputs1.empty()) { + GetRange(inputs2, smallest, largest); + } else if (inputs2.empty()) { + GetRange(inputs1, smallest, largest); + } else { + InternalKey smallest1, smallest2, largest1, largest2; + GetRange(inputs1, &smallest1, &largest1); + GetRange(inputs2, &smallest2, &largest2); + *smallest = + icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2; + *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1; + } +} + +void CompactionPicker::GetRange(const std::vector& inputs, + InternalKey* smallest, InternalKey* largest, + int exclude_level) const { + InternalKey current_smallest; + InternalKey current_largest; + bool initialized = false; + for (const auto& in : inputs) { + if (in.empty() || in.level == exclude_level) { + continue; + } + GetRange(in, ¤t_smallest, ¤t_largest); + if (!initialized) { + *smallest = current_smallest; + *largest = current_largest; + initialized = true; + } else { + if (icmp_->Compare(current_smallest, *smallest) < 0) { + *smallest = current_smallest; + } + if (icmp_->Compare(current_largest, *largest) > 0) { + *largest = current_largest; + } + } + } + assert(initialized); +} + +bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + InternalKey** next_smallest) { + // This isn't good compaction + assert(!inputs->empty()); + + const int level = inputs->level; + // GetOverlappingInputs will always do the right thing for level-0. + // So we don't need to do any expansion if level == 0. + if (level == 0) { + return true; + } + + InternalKey smallest, largest; + + // Keep expanding inputs until we are sure that there is a "clean cut" + // boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = inputs->size(); + GetRange(*inputs, &smallest, &largest); + inputs->clear(); + vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, + hint_index, &hint_index, true, + next_smallest); + } while (inputs->size() > old_size); + + // we started off with inputs non-empty and the previous loop only grew + // inputs. thus, inputs should be non-empty here + assert(!inputs->empty()); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. 
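+  // For example, if L1 contains files [a@10 .. c@8] and [c@7 .. e@3] and only
+  // the first file were compacted down, the c@7 entry left behind in L1 would
+  // shadow the newer c@8 moved to the level below; expanding the inputs to
+  // cover both files keeps all entries of user key c together.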
+ if (AreFilesInCompaction(inputs->files)) { + return false; + } + return true; +} + +bool CompactionPicker::RangeOverlapWithCompaction( + const Slice& smallest_user_key, const Slice& largest_user_key, + int level) const { + const Comparator* ucmp = icmp_->user_comparator(); + for (Compaction* c : compactions_in_progress_) { + if (c->output_level() == level && + ucmp->CompareWithoutTimestamp(smallest_user_key, + c->GetLargestUserKey()) <= 0 && + ucmp->CompareWithoutTimestamp(largest_user_key, + c->GetSmallestUserKey()) >= 0) { + // Overlap + return true; + } + if (c->SupportsPerKeyPlacement()) { + if (c->OverlapPenultimateLevelOutputRange(smallest_user_key, + largest_user_key)) { + return true; + } + } + } + // Did not overlap with any running compaction in level `level` + return false; +} + +bool CompactionPicker::FilesRangeOverlapWithCompaction( + const std::vector& inputs, int level, + int penultimate_level) const { + bool is_empty = true; + for (auto& in : inputs) { + if (!in.empty()) { + is_empty = false; + break; + } + } + if (is_empty) { + // No files in inputs + return false; + } + + // TODO: Intra L0 compactions can have the ranges overlapped, but the input + // files cannot be overlapped in the order of L0 files. + InternalKey smallest, largest; + GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel); + if (penultimate_level != Compaction::kInvalidLevel) { + if (ioptions_.compaction_style == kCompactionStyleUniversal) { + if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), + penultimate_level)) { + return true; + } + } else { + InternalKey penultimate_smallest, penultimate_largest; + GetRange(inputs, &penultimate_smallest, &penultimate_largest, level); + if (RangeOverlapWithCompaction(penultimate_smallest.user_key(), + penultimate_largest.user_key(), + penultimate_level)) { + return true; + } + } + } + + return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), + level); +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::AreFilesInCompaction( + const std::vector& files) { + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +Compaction* CompactionPicker::CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_files, int output_level, + VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, uint32_t output_path_id) { +#ifndef NDEBUG + assert(input_files.size()); + // This compaction output should not overlap with a running compaction as + // `SanitizeCompactionInputFiles` should've checked earlier and db mutex + // shouldn't have been released since. 
+ int start_level = Compaction::kInvalidLevel; + for (const auto& in : input_files) { + // input_files should already be sorted by level + if (!in.empty()) { + start_level = in.level; + break; + } + } + assert(output_level == 0 || + !FilesRangeOverlapWithCompaction( + input_files, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, + start_level, output_level))); +#endif /* !NDEBUG */ + + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = GetCompressionType(vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. + compression_type = compact_options.compression; + } + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files, + output_level, compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_options.max_subcompactions, + /* grandparents */ {}, true); + RegisterCompaction(c); + return c; +} + +Status CompactionPicker::GetCompactionInputsFromFileNumbers( + std::vector* input_files, + std::unordered_set* input_set, const VersionStorageInfo* vstorage, + const CompactionOptions& /*compact_options*/) const { + if (input_set->size() == 0U) { + return Status::InvalidArgument( + "Compaction must include at least one file."); + } + assert(input_files); + + std::vector matched_input_files; + matched_input_files.resize(vstorage->num_levels()); + int first_non_empty_level = -1; + int last_non_empty_level = -1; + // TODO(yhchiang): use a lazy-initialized mapping from + // file_number to FileMetaData in Version. + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (auto file : vstorage->LevelFiles(level)) { + auto iter = input_set->find(file->fd.GetNumber()); + if (iter != input_set->end()) { + matched_input_files[level].files.push_back(file); + input_set->erase(iter); + last_non_empty_level = level; + if (first_non_empty_level == -1) { + first_non_empty_level = level; + } + } + } + } + + if (!input_set->empty()) { + std::string message( + "Cannot find matched SST files for the following file numbers:"); + for (auto fn : *input_set) { + message += " "; + message += std::to_string(fn); + } + return Status::InvalidArgument(message); + } + + for (int level = first_non_empty_level; level <= last_non_empty_level; + ++level) { + matched_input_files[level].level = level; + input_files->emplace_back(std::move(matched_input_files[level])); + } + + return Status::OK(); +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* level_index) { + std::vector inputs; + assert(level < NumberLevels()); + + vstorage->GetOverlappingInputs(level, smallest, largest, &inputs, + level_index ? *level_index : 0, level_index); + return AreFilesInCompaction(inputs); +} + +// Populates the set of inputs of all other levels that overlap with the +// start level. +// Now we assume all levels except start level and output level are empty. 
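+// (For example, for an L1 -> L4 compaction, L2 and L3 must contain no files;
+// this is asserted below.)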
+// Will also attempt to expand "start level" if that doesn't expand +// "output level" or cause "level" to include a file for compaction that has an +// overlapping user-key with another file. +// REQUIRES: input_level and output_level are different +// REQUIRES: inputs->empty() == false +// Returns false if files on parent level are currently in compaction, which +// means that we can't compact them +bool CompactionPicker::SetupOtherInputs( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, CompactionInputFiles* inputs, + CompactionInputFiles* output_level_inputs, int* parent_index, + int base_index, bool only_expand_towards_right) { + assert(!inputs->empty()); + assert(output_level_inputs->empty()); + const int input_level = inputs->level; + const int output_level = output_level_inputs->level; + if (input_level == output_level) { + // no possibility of conflict + return true; + } + + // For now, we only support merging two levels, start level and output level. + // We need to assert other levels are empty. + for (int l = input_level + 1; l < output_level; l++) { + assert(vstorage->NumLevelFiles(l) == 0); + } + + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(*inputs, &smallest, &largest); + + // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to + // include in compaction + vstorage->GetOverlappingInputs(output_level, &smallest, &largest, + &output_level_inputs->files, *parent_index, + parent_index); + if (AreFilesInCompaction(output_level_inputs->files)) { + return false; + } + if (!output_level_inputs->empty()) { + if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) { + return false; + } + } + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. + if (!output_level_inputs->empty()) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + const uint64_t output_level_inputs_size = + TotalFileSize(output_level_inputs->files); + const uint64_t inputs_size = TotalFileSize(inputs->files); + bool expand_inputs = false; + + CompactionInputFiles expanded_inputs; + expanded_inputs.level = input_level; + // Get closed interval of output level + InternalKey all_start, all_limit; + GetRange(*inputs, *output_level_inputs, &all_start, &all_limit); + bool try_overlapping_inputs = true; + if (only_expand_towards_right) { + // Round-robin compaction only allows expansion towards the larger side. 
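+      // (Presumably, expanding toward smaller keys could pull in files behind
+      // the round-robin cursor, which would defeat the cursor's forward
+      // progress.)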
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit, + &expanded_inputs.files, base_index, + nullptr); + } else { + vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit, + &expanded_inputs.files, base_index, + nullptr); + } + uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files); + if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) { + try_overlapping_inputs = false; + } + if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && + !AreFilesInCompaction(expanded_inputs.files)) { + InternalKey new_start, new_limit; + GetRange(expanded_inputs, &new_start, &new_limit); + CompactionInputFiles expanded_output_level_inputs; + expanded_output_level_inputs.level = output_level; + vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit, + &expanded_output_level_inputs.files, + *parent_index, parent_index); + assert(!expanded_output_level_inputs.empty()); + if (!AreFilesInCompaction(expanded_output_level_inputs.files) && + ExpandInputsToCleanCut(cf_name, vstorage, + &expanded_output_level_inputs) && + expanded_output_level_inputs.size() == output_level_inputs->size()) { + expand_inputs = true; + } + } + if (!expand_inputs) { + vstorage->GetCleanInputsWithinInterval(input_level, &all_start, + &all_limit, &expanded_inputs.files, + base_index, nullptr); + expanded_inputs_size = TotalFileSize(expanded_inputs.files); + if (expanded_inputs.size() > inputs->size() && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && + !AreFilesInCompaction(expanded_inputs.files)) { + expand_inputs = true; + } + } + if (expand_inputs) { + ROCKS_LOG_INFO(ioptions_.logger, + "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt + "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt + "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", + cf_name.c_str(), input_level, inputs->size(), + output_level_inputs->size(), inputs_size, + output_level_inputs_size, expanded_inputs.size(), + output_level_inputs->size(), expanded_inputs_size, + output_level_inputs_size); + inputs->files = expanded_inputs.files; + } + } else { + // Likely to be trivial move. Expand files if they are still trivial moves, + // but limit to mutable_cf_options.max_compaction_bytes or 8 files so that + // we don't create too much compaction pressure for the next level. 
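+    // For example (hypothetical files): an L1 input spanning [k10, k20] with
+    // no overlapping L2 file can simply be relinked down to L2 without
+    // rewriting any data blocks, so aggressive expansion is not worthwhile
+    // here.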
+  }
+  return true;
+}
+
+void CompactionPicker::GetGrandparents(
+    VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+    const CompactionInputFiles& output_level_inputs,
+    std::vector<FileMetaData*>* grandparents) {
+  InternalKey start, limit;
+  GetRange(inputs, output_level_inputs, &start, &limit);
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2 or the first
+  // level after that has overlapping files)
+  for (int level = output_level_inputs.level + 1; level < NumberLevels();
+       level++) {
+    vstorage->GetOverlappingInputs(level, &start, &limit, grandparents);
+    if (!grandparents->empty()) {
+      break;
+    }
+  }
+}
+
+Compaction* CompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+    const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+  if (input_level == ColumnFamilyData::kCompactAllLevels) {
+    assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+    // Universal compaction with more than one level always compacts all the
+    // files together to the last level.
+    assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level == vstorage->num_levels() - 2);
+    } else {
+      assert(output_level == vstorage->num_levels() - 1);
+    }
+    // DBImpl::RunManualCompaction will make the full range for universal
+    // compaction
+    assert(begin == nullptr);
+    assert(end == nullptr);
+    *compaction_end = nullptr;
+
+    int start_level = 0;
+    for (; start_level < vstorage->num_levels() &&
+           vstorage->NumLevelFiles(start_level) == 0;
+         start_level++) {
+    }
+    if (start_level == vstorage->num_levels()) {
+      return nullptr;
+    }
+
+    if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+      *manual_conflict = true;
+      // Only one level 0 compaction allowed
+      return nullptr;
+    }
+
+    std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+                                             start_level);
+    for (int level = start_level; level < vstorage->num_levels(); level++) {
+      inputs[level - start_level].level = level;
+      auto& files = inputs[level - start_level].files;
+      for (FileMetaData* f : vstorage->LevelFiles(level)) {
+        files.push_back(f);
+      }
+      if (AreFilesInCompaction(files)) {
+        *manual_conflict = true;
+        return nullptr;
+      }
+    }
+
+    // Two non-exclusive manual compactions could run at the same time,
+    // producing overlapping outputs in the same level.
+    if (FilesRangeOverlapWithCompaction(
+            inputs, output_level,
+            Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+                                                 start_level, output_level))) {
+      // This compaction output could potentially conflict with the output
+      // of a currently running compaction; we cannot run it.
+      *manual_conflict = true;
+      return nullptr;
+    }
+
+    Compaction* c = new Compaction(
+        vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+        std::move(inputs), output_level,
+        MaxFileSizeForLevel(mutable_cf_options, output_level,
+                            ioptions_.compaction_style),
+        /* max_compaction_bytes */ LLONG_MAX,
+        compact_range_options.target_path_id,
+        GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
+        GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+        Temperature::kUnknown, compact_range_options.max_subcompactions,
+        /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+        /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+        CompactionReason::kUnknown,
+        compact_range_options.blob_garbage_collection_policy,
+        compact_range_options.blob_garbage_collection_age_cutoff);
+
+    RegisterCompaction(c);
+    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+    return c;
+  }
+
+  CompactionInputFiles inputs;
+  inputs.level = input_level;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+
+  vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+    // Only one level 0 compaction allowed
+    TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    uint64_t input_level_total = 0;
+    int hint_index = -1;
+    InternalKey* smallest = nullptr;
+    InternalKey* largest = nullptr;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      if (!smallest) {
+        smallest = &inputs[i]->smallest;
+      }
+      largest = &inputs[i]->largest;
+
+      uint64_t input_file_size = inputs[i]->fd.GetFileSize();
+      uint64_t output_level_total = 0;
+      if (output_level < vstorage->num_non_empty_levels()) {
+        std::vector<FileMetaData*> files;
+        vstorage->GetOverlappingInputsRangeBinarySearch(
+            output_level, smallest, largest, &files, hint_index, &hint_index);
+        for (const auto& file : files) {
+          output_level_total += file->fd.GetFileSize();
+        }
+      }
+
+      input_level_total += input_file_size;
+
+      if (input_level_total + output_level_total >= limit) {
+        covering_the_whole_range = false;
+        // Still include the current file, so the compaction could be larger
+        // than max_compaction_bytes. This also makes sure the compaction can
+        // make progress even if `max_compaction_bytes` is small (e.g. smaller
+        // than an SST file).
+        inputs.files.resize(i + 1);
+        break;
+      }
+    }
+  }
+
+  assert(compact_range_options.target_path_id <
+         static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // For BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
+  // files that are created during the current compaction.
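+  // For example (hypothetical file numbers): with inputs {8.sst, 9.sst,
+  // 12.sst} and max_file_num_to_ignore == 10, only 8.sst and 9.sst are kept
+  // below; 12.sst was produced by this same manual compaction, so
+  // re-compacting it would make no progress.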
+  if (compact_range_options.bottommost_level_compaction ==
+          BottommostLevelCompaction::kForceOptimized &&
+      max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
+    assert(input_level == output_level);
+    // inputs_shrunk holds a continuous subset of input files which were all
+    // created before the current manual compaction
+    std::vector<FileMetaData*> inputs_shrunk;
+    size_t skip_input_index = inputs.size();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        inputs_shrunk.push_back(inputs[i]);
+      } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+        skip_input_index = i;
+        break;
+      }
+    }
+    if (inputs_shrunk.empty()) {
+      return nullptr;
+    }
+    if (inputs.size() != inputs_shrunk.size()) {
+      inputs.files.swap(inputs_shrunk);
+    }
+    // Set covering_the_whole_range to false if there is any file that needs
+    // to be compacted in the range of inputs[skip_input_index+1, inputs.size())
+    for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        covering_the_whole_range = false;
+      }
+    }
+  }
+
+  InternalKey key_storage;
+  InternalKey* next_smallest = &key_storage;
+  if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+      false) {
+    // Manual compaction is now multi-threaded, so it can
+    // happen that ExpandInputsToCleanCut fails;
+    // we handle it higher up in RunManualCompaction
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  if (covering_the_whole_range || !next_smallest) {
+    *compaction_end = nullptr;
+  } else {
+    **compaction_end = *next_smallest;
+  }
+
+  CompactionInputFiles output_level_inputs;
+  if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+    assert(input_level == 0);
+    output_level = vstorage->base_level();
+    assert(output_level > 0);
+  }
+  output_level_inputs.level = output_level;
+  if (input_level != output_level) {
+    int parent_index = -1;
+    if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+                          &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can
+      // happen that SetupOtherInputs fails;
+      // we handle it higher up in RunManualCompaction
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  std::vector<CompactionInputFiles> compaction_inputs({inputs});
+  if (!output_level_inputs.empty()) {
+    compaction_inputs.push_back(output_level_inputs);
+  }
+  for (size_t i = 0; i < compaction_inputs.size(); i++) {
+    if (AreFilesInCompaction(compaction_inputs[i].files)) {
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+  if (FilesRangeOverlapWithCompaction(
+          compaction_inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
+                                               output_level))) {
+    // This compaction output could potentially conflict with the output
+    // of a currently running compaction; we cannot run it.
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+  Compaction* compaction = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(compaction_inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options, output_level,
+                          ioptions_.compaction_style, vstorage->base_level(),
+                          ioptions_.level_compaction_dynamic_level_bytes),
+      mutable_cf_options.max_compaction_bytes,
+      compact_range_options.target_path_id,
+      GetCompressionType(vstorage, mutable_cf_options, output_level,
+                         vstorage->base_level()),
+      GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+      Temperature::kUnknown, compact_range_options.max_subcompactions,
+      std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1,
+      /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+      CompactionReason::kUnknown,
+      compact_range_options.blob_garbage_collection_policy,
+      compact_range_options.blob_garbage_collection_age_cutoff);
+
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+  RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here
+  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+  return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+    if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+}  // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  auto& levels = cf_meta.levels;
+  auto comparator = icmp_->user_comparator();
+
+  // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+  // the smallest and largest key of the current compaction input
+  std::string smallestkey;
+  std::string largestkey;
+  // a flag for initializing smallest and largest key
+  bool is_first = false;
+  const int kNotFound = -1;
+
+  // For each level, it does the following things:
+  // 1. Find the first and the last compaction input files
+  //    in the current level.
+  // 2. Include all files between the first and the last
+  //    compaction input files.
+  // 3. Update the compaction key-range.
+  // 4. For all remaining levels, include files that have
+  //    overlapping key-range with the compaction key-range.
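+  //
+  // For example (hypothetical files): if the caller passes a single L1 file
+  // spanning [k3, k5] and its L1 neighbor [k5, k7] shares the boundary user
+  // key k5, step 2 pulls the neighbor in, and step 4 then adds every L2 file
+  // overlapping the widened range [k3, k7].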
+  for (int l = 0; l <= output_level; ++l) {
+    auto& current_files = levels[l].files;
+    int first_included = static_cast<int>(current_files.size());
+    int last_included = kNotFound;
+
+    // identify the first and the last compaction input files
+    // in the current level.
+    for (size_t f = 0; f < current_files.size(); ++f) {
+      const uint64_t file_number = TableFileNameToNumber(current_files[f].name);
+      if (input_files->find(file_number) == input_files->end()) {
+        continue;
+      }
+      first_included = std::min(first_included, static_cast<int>(f));
+      last_included = std::max(last_included, static_cast<int>(f));
+      if (is_first == false) {
+        smallestkey = current_files[f].smallestkey;
+        largestkey = current_files[f].largestkey;
+        is_first = true;
+      }
+    }
+    if (last_included == kNotFound) {
+      continue;
+    }
+
+    if (l != 0) {
+      // expand the compaction input of the current level if it
+      // has overlapping key-range with other non-compaction input
+      // files in the same level.
+      while (first_included > 0) {
+        if (comparator->CompareWithoutTimestamp(
+                current_files[first_included - 1].largestkey,
+                current_files[first_included].smallestkey) < 0) {
+          break;
+        }
+        first_included--;
+      }
+
+      while (last_included < static_cast<int>(current_files.size()) - 1) {
+        if (comparator->CompareWithoutTimestamp(
+                current_files[last_included + 1].smallestkey,
+                current_files[last_included].largestkey) > 0) {
+          break;
+        }
+        last_included++;
+      }
+    } else if (output_level > 0) {
+      last_included = static_cast<int>(current_files.size() - 1);
+    }
+
+    // include all files between the first and the last compaction input files.
+    for (int f = first_included; f <= last_included; ++f) {
+      if (current_files[f].being_compacted) {
+        return Status::Aborted("Necessary compaction input file " +
+                               current_files[f].name +
+                               " is currently being compacted.");
+      }
+      input_files->insert(TableFileNameToNumber(current_files[f].name));
+    }
+
+    // update smallest and largest key
+    if (l == 0) {
+      for (int f = first_included; f <= last_included; ++f) {
+        if (comparator->CompareWithoutTimestamp(
+                smallestkey, current_files[f].smallestkey) > 0) {
+          smallestkey = current_files[f].smallestkey;
+        }
+        if (comparator->CompareWithoutTimestamp(
+                largestkey, current_files[f].largestkey) < 0) {
+          largestkey = current_files[f].largestkey;
+        }
+      }
+    } else {
+      if (comparator->CompareWithoutTimestamp(
+              smallestkey, current_files[first_included].smallestkey) > 0) {
+        smallestkey = current_files[first_included].smallestkey;
+      }
+      if (comparator->CompareWithoutTimestamp(
+              largestkey, current_files[last_included].largestkey) < 0) {
+        largestkey = current_files[last_included].largestkey;
+      }
+    }
+
+    SstFileMetaData aggregated_file_meta;
+    aggregated_file_meta.smallestkey = smallestkey;
+    aggregated_file_meta.largestkey = largestkey;
+
+    // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too because even
+    // if there are no input files in level l, we would still need to add files
+    // which overlap with the range containing the input files in levels 0 to l.
+    // Level 0 doesn't need to be handled this way because files are sorted by
+    // time and not by key
+    for (int m = std::max(l, 1); m <= output_level; ++m) {
+      for (auto& next_lv_file : levels[m].files) {
+        if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+                                     next_lv_file)) {
+          if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                " that has an overlapping key range with one of the compaction"
+                " input files is currently being compacted.");
+          }
+          input_files->insert(TableFileNameToNumber(next_lv_file.name));
+        }
+      }
+    }
+  }
+  if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+    return Status::Aborted(
+        "A running compaction is writing to the same output level in an "
+        "overlapping key range");
+  }
+  return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+         cf_meta.levels[cf_meta.levels.size() - 1].level);
+  if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+    return Status::InvalidArgument(
+        "Output level for column family " + cf_meta.name +
+        " must be within [0, " +
+        std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+  }
+
+  if (output_level > MaxOutputLevel()) {
+    return Status::InvalidArgument(
+        "Exceeds the maximum output level defined by "
+        "the current compaction algorithm --- " +
+        std::to_string(MaxOutputLevel()));
+  }
+
+  if (output_level < 0) {
+    return Status::InvalidArgument("Output level cannot be negative.");
+  }
+
+  if (input_files->size() == 0) {
+    return Status::InvalidArgument(
+        "A compaction must contain at least one file.");
+  }
+
+  Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+                                                      output_level);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // For all input files, check whether the file number matches
+  // any currently-existing files.
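+  // Each failure mode below maps to a distinct status: a file that is
+  // currently being compacted yields Status::Aborted, while an unknown file
+  // number or an input file above the output level yields
+  // Status::InvalidArgument.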
+  for (auto file_num : *input_files) {
+    bool found = false;
+    int input_file_level = -1;
+    for (const auto& level_meta : cf_meta.levels) {
+      for (const auto& file_meta : level_meta.files) {
+        if (file_num == TableFileNameToNumber(file_meta.name)) {
+          if (file_meta.being_compacted) {
+            return Status::Aborted("Specified compaction input file " +
+                                   MakeTableFileName("", file_num) +
+                                   " is already being compacted.");
+          }
+          found = true;
+          input_file_level = level_meta.level;
+          break;
+        }
+      }
+      if (found) {
+        break;
+      }
+    }
+    if (!found) {
+      return Status::InvalidArgument(
+          "Specified compaction input file " + MakeTableFileName("", file_num) +
+          " does not exist in column family " + cf_meta.name + ".");
+    }
+    if (input_file_level > output_level) {
+      return Status::InvalidArgument(
+          "Cannot compact file to a higher level, input file: " +
+          MakeTableFileName("", file_num) + " level " +
+          std::to_string(input_file_level) + " > output level " +
+          std::to_string(output_level));
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+         c->output_level() == 0 ||
+         !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
+                                          c->GetPenultimateLevel()));
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.insert(c);
+  }
+  compactions_in_progress_.insert(c);
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered",
+                           c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.erase(c);
+  }
+  compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+    const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+    int* output_level, CompactionInputFiles* start_level_inputs) {
+  if (vstorage->FilesMarkedForCompaction().empty()) {
+    return;
+  }
+
+  auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    *start_level = level_file.first;
+    *output_level =
+        (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+    if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+      return false;
+    }
+
+    start_level_inputs->files = {level_file.second};
+    start_level_inputs->level = *start_level;
+    return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+  };
+
+  // take a chance on a random file first
+  Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+  size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+      static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction",
+                           &random_file_index);
+
+  if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+    // found the compaction!
+    return;
+  }
+
+  for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+  start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+    VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+    int output_level, int* parent_index) {
+  // Two level 0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+  assert(level0_compactions_in_progress()->empty());
+  InternalKey smallest, largest;
+  GetRange(*start_level_inputs, &smallest, &largest);
+  // Note that the next call will discard the file we placed in
+  // c->inputs_[0] earlier and replace it with an overlapping set
+  // which will include the picked file.
+  start_level_inputs->files.clear();
+  vstorage->GetOverlappingInputs(0, &smallest, &largest,
+                                 &(start_level_inputs->files));
+
+  // If we include more L0 files in the same compaction run it can
+  // cause the 'smallest' and 'largest' key to get extended to a
+  // larger range. So, re-invoke GetRange to get the new key range
+  GetRange(*start_level_inputs, &smallest, &largest);
+  if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+                          parent_index)) {
+    return false;
+  }
+  assert(!start_level_inputs->files.empty());
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..7739dd96b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,323 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains an abstract class CompactionPicker, and its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits this class and implements the
+// interface to form automatic compactions. If NeedCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+  CompactionPicker(const ImmutableOptions& ioptions,
+                   const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction. Caller should delete the result.
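+  //
+  // Illustrative caller sketch (hypothetical, not code from this file):
+  //   std::unique_ptr<Compaction> c(picker->PickCompaction(
+  //       cf_name, mutable_cf_options, mutable_db_options, vstorage,
+  //       log_buffer));
+  //   if (c) { /* schedule a compaction job for c */ }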
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level. Returns nullptr if there is nothing in that
+  // level that overlaps the specified range. Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to a valid InternalKey!
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+
+  // The maximum allowed output level. Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+  Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+                                      const ColumnFamilyMetaData& cf_meta,
+                                      const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Free up the files that participated in a compaction
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files are being compacted
+  bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+  // object.
+  //
+  // Caller must provide a set of input files that has been passed through
+  // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+  // between that call and this one.
+  Compaction* CompactFiles(const CompactionOptions& compact_options,
+                           const std::vector<CompactionInputFiles>& input_files,
+                           int output_level, VersionStorageInfo* vstorage,
+                           const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           uint32_t output_path_id);
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      std::vector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;
+
+  // Is there currently a compaction involving level 0 taking place?
+  bool IsLevel0CompactionInProgress() const {
+    return !level0_compactions_in_progress_.empty();
+  }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
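+  // (SanitizeCompactionInputFilesForAllLevels uses this to reject manual
+  // picks whose output range would collide with a running compaction.)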
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs1,
+                const CompactionInputFiles& inputs2, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+  void GetRange(const std::vector<CompactionInputFiles>& inputs,
+                InternalKey* smallest, InternalKey* largest,
+                int exclude_level) const;
+
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in a "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandInputsToCleanCut(const std::string& cf_name,
+                              VersionStorageInfo* vstorage,
+                              CompactionInputFiles* inputs,
+                              InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files is being compacted
+  bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+                           const InternalKey* smallest,
+                           const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlaps with the
+  // key range of a currently running compaction.
+  bool FilesRangeOverlapWithCompaction(
+      const std::vector<CompactionInputFiles>& inputs, int level,
+      int penultimate_level) const;
+
+  bool SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage,
+                        CompactionInputFiles* inputs,
+                        CompactionInputFiles* output_level_inputs,
+                        int* parent_index, int base_index,
+                        bool only_expand_towards_right = false);
+
+  void GetGrandparents(VersionStorageInfo* vstorage,
+                       const CompactionInputFiles& inputs,
+                       const CompactionInputFiles& output_level_inputs,
+                       std::vector<FileMetaData*>* grandparents);
+
+  void PickFilesMarkedForCompaction(const std::string& cf_name,
+                                    VersionStorageInfo* vstorage,
+                                    int* start_level, int* output_level,
+                                    CompactionInputFiles* start_level_inputs);
+
+  bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+                             CompactionInputFiles* start_level_inputs,
+                             int output_level, int* parent_index);
+
+  // Register this compaction in the set of running compactions
+  void RegisterCompaction(Compaction* c);
+
+  // Remove this compaction from the set of running compactions
+  void UnregisterCompaction(Compaction* c);
+
+  std::set<Compaction*>* level0_compactions_in_progress() {
+    return &level0_compactions_in_progress_;
+  }
+  std::unordered_set<Compaction*>* compactions_in_progress() {
+    return &compactions_in_progress_;
+  }
+
+  const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+  const ImmutableOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Keeps track of all compactions that are running on Level0.
+  // Protected by DB mutex
+  std::set<Compaction*> level0_compactions_in_progress_;
+
+  // Keeps track of all compactions that are running.
+  // Protected by DB mutex
+  std::unordered_set<Compaction*> compactions_in_progress_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+  NullCompactionPicker(const ImmutableOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always return "nullptr"
+  Compaction* PickCompaction(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      const MutableDBOptions& /*mutable_db_options*/,
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+      SequenceNumber /* earliest_memtable_seqno */) override {
+    return nullptr;
+  }
+
+  // Always return "nullptr"
+  Compaction* CompactRange(const std::string& /*cf_name*/,
+                           const MutableCFOptions& /*mutable_cf_options*/,
+                           const MutableDBOptions& /*mutable_db_options*/,
+                           VersionStorageInfo* /*vstorage*/,
+                           int /*input_level*/, int /*output_level*/,
+                           const CompactRangeOptions& /*compact_range_options*/,
+                           const InternalKey* /*begin*/,
+                           const InternalKey* /*end*/,
+                           InternalKey** /*compaction_end*/,
+                           bool* /*manual_conflict*/,
+                           uint64_t /*max_file_num_to_ignore*/,
+                           const std::string& /*trim_ts*/) override {
+    return nullptr;
+  }
+
+  // Always returns false.
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* /*vstorage*/) const override {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files                     Metadata for L0 files.
+// @param min_files_to_compact            Minimum number of files required to
+//                                        do the compaction.
+// @param max_compact_bytes_per_del_file  Maximum average size in bytes per
+//                                        file that is going to get deleted by
+//                                        the compaction.
+// @param max_compaction_bytes            Maximum total size in bytes (in terms
+//                                        of compensated file size) for files
+//                                        to be compacted.
+// @param [out] comp_inputs               If a compaction was found, will be
+//                                        initialized with corresponding input
+//                                        files. Cannot be nullptr.
+//
+// @return                                true iff compaction was found.
+bool FindIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs,
+    SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+    const MutableCFOptions& mutable_cf_options,
+    const VersionStorageInfo* vstorage, int level,
+    const bool enable_compression = true);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..1f875e3e1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total_size = 0;
+  for (const auto& f : files) {
+    total_size += f->fd.file_size;
+  }
+  return total_size;
+}
+}  // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  assert(mutable_cf_options.ttl > 0);
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  int64_t _current_time;
+  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on TTL. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.ttl) {
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      FileMetaData* f = *ritr;
+      assert(f);
+      if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+        uint64_t creation_time =
+            f->fd.table_reader->GetTableProperties()->creation_time;
+        if (creation_time == 0 ||
+            creation_time >= (current_time - mutable_cf_options.ttl)) {
+          break;
+        }
+      }
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+    }
+  }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl OR
+  // 2. there are a few files older than ttl, but deleting them will not bring
+  //    the total size to be less than max_table_files_size threshold.
+  if (inputs[0].files.empty() ||
+      total_size >
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    return nullptr;
+  }
+
+  for (const auto& f : inputs[0].files) {
+    uint64_t creation_time = 0;
+    assert(f);
+    if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+      creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+    }
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: picking file %" PRIu64
+                     " with creation time %" PRIu64 " for deletion",
+                     cf_name.c_str(), f->fd.GetNumber(), creation_time);
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(inputs), 0, 0, 0, 0, kNoCompression,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
+      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
+      CompactionReason::kFIFOTtl);
+  return c;
+}
+
+// The size-based compaction picker for FIFO.
+//
+// When the entire column family size exceeds max_table_files_size, FIFO will
+// try to delete the oldest sst file(s) until the resulting column family size
+// is smaller than max_table_files_size.
+//
+// This function also takes care of the case where a DB is migrating from level
+// or universal compaction to FIFO compaction. During the migration, the column
+// family will also have non-L0 files while FIFO can only create L0 files.
+// In this case, this function will purge the sst files in the bottom-most
+// non-empty level first, and the DB will eventually converge to the regular
+// FIFO case where there are only L0 files. Note that during the migration
+// case, the purge order will only be an approximation of "FIFO", as entries
+// inside lower-level files might sometimes be newer than some entries inside
+// upper-level files.
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  // compute the total size and identify the last non-empty level
+  int last_level = 0;
+  uint64_t total_size = 0;
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level));
+    total_size += level_size;
+    if (level_size > 0) {
+      last_level = level;
+    }
+  }
+  const std::vector<FileMetaData*>& last_level_files =
+      vstorage->LevelFiles(last_level);
+
+  if (last_level == 0 &&
+      total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    // total size not exceeded, try to find intra level 0 compaction if enabled
+    const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+        level0_files.size() > 0) {
+      CompactionInputFiles comp_inputs;
+      // try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than memtable
+      // (inflate its size by 10% to account for uncompressed L0 files that
+      // may have size slightly greater than memtable size limit).
+      size_t max_compact_bytes_per_del_file =
+          static_cast<size_t>(MultiplyCheckOverflow(
+              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+              1.1));
+      if (FindIntraL0Compaction(
+              level0_files,
+              mutable_cf_options
+                  .level0_file_num_compaction_trigger /* min_files_to_compact */,
+              max_compact_bytes_per_del_file,
+              mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+        Compaction* c = new Compaction(
+            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+            {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+            0 /* max compaction bytes, not applicable */,
+            0 /* output path ID */, mutable_cf_options.compression,
+            mutable_cf_options.compression_opts, Temperature::kUnknown,
+            0 /* max_subcompactions */, {}, /* is manual */ false,
+            /* trim_ts */ "", vstorage->CompactionScore(0),
+            /* is deletion compaction */ false,
+            /* l0_files_might_overlap */ true,
+            CompactionReason::kFIFOReduceNumFiles);
+        return c;
+      }
+    }
+
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+        ", max size %" PRIu64 "\n",
+        cf_name.c_str(), total_size,
+        mutable_cf_options.compaction_options_fifo.max_table_files_size);
+    return nullptr;
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = last_level;
+
+  if (last_level == 0) {
+    // In L0, right-most files are the oldest files.
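+    // (LevelFiles(0) is ordered newest-first, so the reverse iteration below
+    // visits the oldest file first and stops once the size target is met.)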
+    for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
+         ++ritr) {
+      auto f = *ritr;
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+      char tmp_fsize[16];
+      AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion",
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+      if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+        break;
+      }
+    }
+  } else {
+    // If the last level is non-L0, we actually don't know which file is
+    // logically the oldest since the file creation time only represents
+    // when this file was compacted to this level, which is independent
+    // of when the entries in this file were first inserted.
+    //
+    // As a result, we delete files from the left instead. This means the sst
+    // file with the smallest key will be deleted first. This design decision
+    // better serves a major type of FIFO use cases where smaller keys are
+    // associated with older data.
+    for (const auto& f : last_level_files) {
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+      char tmp_fsize[16];
+      AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion",
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+      if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+        break;
+      }
+    }
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(inputs), last_level,
+      /* target_file_size */ 0,
+      /* max_compaction_bytes */ 0,
+      /* output_path_id */ 0, kNoCompression,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
+      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* is deletion compaction */ true,
+      /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+    return nullptr;
+  }
+
+  // PickCompactionToWarm is only triggered if there are no non-L0 files.
+  for (int level = 1; level < vstorage->num_levels(); ++level) {
+    if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
+      return nullptr;
+    }
+  }
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+
+  int64_t _current_time;
+  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on warm threshold. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. Parallel "
+        "compactions are not supported",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
+    uint64_t create_time_threshold =
+        current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+    uint64_t compaction_size = 0;
+    // Ideally, we would identify a file qualifying for the warm tier from the
+    // timestamp of the youngest entry in the file. Since we don't have that
+    // information, we infer it from the oldest entry's timestamp of the next
+    // (slightly younger) file.
+    FileMetaData* prev_file = nullptr;
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      FileMetaData* f = *ritr;
+      assert(f);
+      if (f->being_compacted) {
+        // Right now this probably won't happen as we never try to schedule
+        // two compactions in parallel, so here we just simply don't schedule
+        // anything.
+        return nullptr;
+      }
+      uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+      if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+        // Older files might not have enough information. It is possible to
+        // handle these files by looking at newer files, but maintaining the
+        // logic isn't worth it.
+        break;
+      }
+      if (oldest_ancester_time > create_time_threshold) {
+        // The previous file (which has slightly older data) doesn't qualify
+        // for the warm tier.
+        break;
+      }
+      if (prev_file != nullptr) {
+        compaction_size += prev_file->fd.GetFileSize();
+        if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+          break;
+        }
+        inputs[0].files.push_back(prev_file);
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] FIFO compaction: picking file %" PRIu64
+                         " with next file's oldest time %" PRIu64 " for warm",
+                         cf_name.c_str(), prev_file->fd.GetNumber(),
+                         oldest_ancester_time);
+      }
+      if (f->temperature == Temperature::kUnknown ||
+          f->temperature == Temperature::kHot) {
+        prev_file = f;
+      } else if (!inputs[0].files.empty()) {
+        // A warm file newer than the files already picked.
+ break; + } else { + assert(prev_file == nullptr); + } + } + } + + if (inputs[0].files.empty()) { + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + Temperature::kWarm, + /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", + vstorage->CompactionScore(0), + /* is deletion compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kChangeTemperature); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { + Compaction* c = nullptr; + if (mutable_cf_options.ttl > 0) { + c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + RegisterCompaction(c); + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** compaction_end, bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) { +#ifdef NDEBUG + (void)input_level; + (void)output_level; +#endif + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); + Compaction* c = PickCompaction(cf_name, mutable_cf_options, + mutable_db_options, vstorage, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h new file mode 100644 index 000000000..544259f38 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h @@ -0,0 +1,63 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class FIFOCompactionPicker : public CompactionPicker { + public: + FIFOCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts) override; + + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; + + private: + Compaction* PickTTLCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickSizeCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickCompactionToWarm(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc new file mode 100644 index 000000000..b689b6add --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.cc @@ -0,0 +1,841 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  if (!vstorage->ExpiredTtlFiles().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+    return true;
+  }
+  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+    if (vstorage->CompactionScore(i) >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+  LevelCompactionBuilder(const std::string& cf_name,
+                         VersionStorageInfo* vstorage,
+                         SequenceNumber earliest_mem_seqno,
+                         CompactionPicker* compaction_picker,
+                         LogBuffer* log_buffer,
+                         const MutableCFOptions& mutable_cf_options,
+                         const ImmutableOptions& ioptions,
+                         const MutableDBOptions& mutable_db_options)
+      : cf_name_(cf_name),
+        vstorage_(vstorage),
+        earliest_mem_seqno_(earliest_mem_seqno),
+        compaction_picker_(compaction_picker),
+        log_buffer_(log_buffer),
+        mutable_cf_options_(mutable_cf_options),
+        ioptions_(ioptions),
+        mutable_db_options_(mutable_db_options) {}
+
+  // Pick and return a compaction.
+  Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together,
+  // in an intra-L0 compaction).
+  void SetupInitialFiles();
+
+  // If the initial files are from L0, pick other L0 files if needed.
+  bool SetupOtherL0FilesIfNeeded();
+
+  // Compaction with round-robin compaction priority allows more files to be
+  // picked to form a large compaction.
+  void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction.
+  bool SetupOtherInputsIfNeeded();
+
+  Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+  // Returns false if there is no file to compact.
+  // If it returns true, inputs->files.size() will be exactly one for
+  // all compaction priorities except round-robin. For round-robin,
+  // multiple consecutive files may be put into inputs->files.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return false.
+  bool PickFileToCompact();
+
+  // Return true if an L0 trivial move is picked up.
+  bool TryPickL0TrivialMove();
+
+  // For L0->L0, picks the longest span of files that aren't currently
+  // undergoing compaction for which work-per-deleted-file decreases. The span
+  // always starts from the newest L0 file.
+  //
+  // Intra-L0 compaction is independent of all other files, so it can be
+  // performed even when L0->base_level compactions are blocked.
+  //
+  // Returns true if `inputs` is populated with a span of files to be
+  // compacted; otherwise, returns false.
+  bool PickIntraL0Compaction();
+
+  // Return true if TrivialMove is extended. `start_index` is the index of
+  // the initial file picked, which should already be in `start_level_inputs_`.
+  bool TryExtendNonL0TrivialMove(int start_index);
+
+  // Picks a file from level_files to compact.
+  // level_files is a vector of (level, file metadata) in ascending order of
+  // level. If compact_to_next_level is true, compact the file to the next
+  // level, otherwise, compact to the same level as the input file.
+  void PickFileToCompact(
+      const autovector<std::pair<int, FileMetaData*>>& level_files,
+      bool compact_to_next_level);
+
+  const std::string& cf_name_;
+  VersionStorageInfo* vstorage_;
+  SequenceNumber earliest_mem_seqno_;
+  CompactionPicker* compaction_picker_;
+  LogBuffer* log_buffer_;
+  int start_level_ = -1;
+  int output_level_ = -1;
+  int parent_index_ = -1;
+  int base_index_ = -1;
+  double start_level_score_ = 0;
+  bool is_manual_ = false;
+  bool is_l0_trivial_move_ = false;
+  CompactionInputFiles start_level_inputs_;
+  std::vector<CompactionInputFiles> compaction_inputs_;
+  CompactionInputFiles output_level_inputs_;
+  std::vector<FileMetaData*> grandparents_;
+  CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+  const MutableCFOptions& mutable_cf_options_;
+  const ImmutableOptions& ioptions_;
+  const MutableDBOptions& mutable_db_options_;
+  // Pick a path ID to place a newly generated file, with its level
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            int level);
+
+  static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickFileToCompact(
+    const autovector<std::pair<int, FileMetaData*>>& level_files,
+    bool compact_to_next_level) {
+  for (auto& level_file : level_files) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    start_level_ = level_file.first;
+    if ((compact_to_next_level &&
+         start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+        (start_level_ == 0 &&
+         !compaction_picker_->level0_compactions_in_progress()->empty())) {
+      continue;
+    }
+    if (compact_to_next_level) {
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+    } else {
+      output_level_ = start_level_;
+    }
+    start_level_inputs_.files = {level_file.second};
+    start_level_inputs_.level = start_level_;
+    if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                   &start_level_inputs_)) {
+      return;
+    }
+  }
+  start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+  // Find the compactions by size on all levels.
+  bool skipped_l0_to_base = false;
+  for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+    start_level_score_ = vstorage_->CompactionScore(i);
+    start_level_ = vstorage_->CompactionScoreLevel(i);
+    assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+    if (start_level_score_ >= 1) {
+      if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+        // If L0->base_level compaction is pending, don't schedule further
+        // compaction from base level. Otherwise L0->base_level compaction
+        // may starve.
+        continue;
+      }
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+      if (PickFileToCompact()) {
+        // found the compaction!
+        if (start_level_ == 0) {
+          // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+          compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+        } else {
+          // L1+ score = `Level files size` / `MaxBytesForLevel`
+          compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+        }
+        break;
+      } else {
+        // didn't find the compaction, clear the inputs
+        start_level_inputs_.clear();
+        if (start_level_ == 0) {
+          skipped_l0_to_base = true;
+          // L0->base_level may be blocked due to ongoing L0->base_level
+          // compactions. It may also be blocked by an ongoing compaction from
+          // base_level downwards.
+          //
+          // In these cases, to reduce L0 file count and thus reduce likelihood
+          // of write stalls, we can attempt compacting a span of files within
+          // L0.
+          if (PickIntraL0Compaction()) {
+            output_level_ = 0;
+            compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+            break;
+          }
+        }
+      }
+    } else {
+      // Compaction scores are sorted in descending order, no further scores
+      // will be >= 1.
+      break;
+    }
+  }
+  if (!start_level_inputs_.empty()) {
+    return;
+  }
+
+  // if we didn't find a compaction, check if there are any files marked for
+  // compaction
+  parent_index_ = base_index_ = -1;
+
+  compaction_picker_->PickFilesMarkedForCompaction(
+      cf_name_, vstorage_, &start_level_, &output_level_,
+      &start_level_inputs_);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+    return;
+  }
+
+  // Bottommost Files Compaction on deleting tombstones
+  PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kBottommostFiles;
+    return;
+  }
+
+  // TTL Compaction
+  if (ioptions_.compaction_pri == kRoundRobin &&
+      !vstorage_->ExpiredTtlFiles().empty()) {
+    auto expired_files = vstorage_->ExpiredTtlFiles();
+    // the expired files list should already be sorted by level
+    start_level_ = expired_files.front().first;
+#ifndef NDEBUG
+    for (const auto& file : expired_files) {
+      assert(start_level_ <= file.first);
+    }
+#endif
+    if (start_level_ > 0) {
+      output_level_ = start_level_ + 1;
+      if (PickFileToCompact()) {
+        compaction_reason_ = CompactionReason::kRoundRobinTtl;
+        return;
+      }
+    }
+  }
+
+  PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kTtl;
+    return;
+  }
+
+  // Periodic Compaction
+  PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kPeriodicCompaction;
+    return;
+  }
+
+  // Forced blob garbage collection
+  PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kForcedBlobGC;
+    return;
+  }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+  if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
+    return compaction_picker_->GetOverlappingL0Files(
+        vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+  }
+  return true;
+}
+
+void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
+  // We only expand when the start level is not L0 under round robin
+  assert(start_level_ >= 1);
+
+  // For round-robin compaction priority, we have the following constraints
+  // when picking multiple files.
+  // Constraint 1: We can only pick consecutive files
+  //   -> Constraint 1a: When a file is being compacted (or some input files
+  //                     are being compacted after expanding), we cannot
+  //                     choose it and have to stop choosing more files
+  //   -> Constraint 1b: When we reach the last file (with largest keys), we
+  //                     cannot choose more files (the next file will be the
+  //                     first one)
+  // Constraint 2: We should ensure the total compaction bytes (including the
+  //               overlapped files from the next level) is no more than
+  //               mutable_cf_options_.max_compaction_bytes
+  // Constraint 3: We try our best to pick as many files as possible so that
+  //               the post-compaction level size is less than
+  //               MaxBytesForLevel(start_level_)
+  // Constraint 4: We do not expand if it is possible to apply a trivial move
+  // Constraint 5 (TODO): Try to pick minimal files to split into the target
+  //               number of subcompactions
+  TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+  // Only expand the inputs when we have selected a file in start_level_inputs_
+  if (start_level_inputs_.size() == 0) return;
+
+  uint64_t start_lvl_bytes_no_compacting = 0;
+  uint64_t curr_bytes_to_compact = 0;
+  uint64_t start_lvl_max_bytes_to_compact = 0;
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+  // Constraint 3 (pre-calculate the ideal max bytes to compact)
+  for (auto f : level_files) {
+    if (!f->being_compacted) {
+      start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+    }
+  }
+  if (start_lvl_bytes_no_compacting >
+      vstorage_->MaxBytesForLevel(start_level_)) {
+    start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+                                     vstorage_->MaxBytesForLevel(start_level_);
+  }
+
+  size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+  InternalKey smallest, largest;
+  // Constraint 4 (No need to check again later)
+  compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+  CompactionInputFiles output_level_inputs;
+  output_level_inputs.level = output_level_;
+  vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                  &output_level_inputs.files);
+  if (output_level_inputs.empty()) {
+    if (TryExtendNonL0TrivialMove((int)start_index)) {
+      return;
+    }
+  }
+  // Constraint 3
+  if (start_level_inputs_[0]->fd.GetFileSize() >=
+      start_lvl_max_bytes_to_compact) {
+    return;
+  }
+  CompactionInputFiles tmp_start_level_inputs;
+  tmp_start_level_inputs = start_level_inputs_;
+  // TODO (zichen): Future parallel round-robin may also need to update this
+  // Constraint 1b (only expand till the end)
+  for (size_t i = start_index + 1; i < level_files.size(); i++) {
+    auto* f = level_files[i];
+    if (f->being_compacted) {
+      // Constraint 1a
+      return;
+    }
+
+    tmp_start_level_inputs.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &tmp_start_level_inputs) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {tmp_start_level_inputs}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // Constraint 1a
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    curr_bytes_to_compact = 0;
+    for (auto start_lvl_f : tmp_start_level_inputs.files) {
+      curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+    }
+
+    // Check whether any output level files are locked
+    compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (!output_level_inputs.empty() &&
+        !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &output_level_inputs)) {
+      // Constraint 1a
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
+    for (auto output_lvl_f : output_level_inputs.files) {
+      curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
+    }
+    if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
+      // Constraint 2
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    start_level_inputs_.files = tmp_start_level_inputs.files;
+    // Constraint 3
+    if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
+      return;
+    }
+  }
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+  // Setup input files from output level. For output to L0, we only compact
+  // spans of files that do not interact with any pending compactions, so don't
+  // need to consider other levels.
+  if (output_level_ != 0) {
+    output_level_inputs_.level = output_level_;
+    bool round_robin_expanding =
+        ioptions_.compaction_pri == kRoundRobin &&
+        compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
+    if (round_robin_expanding) {
+      SetupOtherFilesWithRoundRobinExpansion();
+    }
+    if (!is_l0_trivial_move_ &&
+        !compaction_picker_->SetupOtherInputs(
+            cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+            &output_level_inputs_, &parent_index_, base_index_,
+            round_robin_expanding)) {
+      return false;
+    }
+
+    compaction_inputs_.push_back(start_level_inputs_);
+    if (!output_level_inputs_.empty()) {
+      compaction_inputs_.push_back(output_level_inputs_);
+    }
+
+    if (!is_l0_trivial_move_) {
+      // In some edge cases we could pick a compaction that will be compacting
+      // a key range that overlaps with another running compaction, and both
+      // of them have the same output level. This could happen if
+      // (1) we are running a non-exclusive manual compaction
+      // (2) AddFile ingests a new file into the LSM tree
+      // We need to disallow this from happening.
+      if (compaction_picker_->FilesRangeOverlapWithCompaction(
+              compaction_inputs_, output_level_,
+              Compaction::EvaluatePenultimateLevel(
+                  vstorage_, ioptions_, start_level_, output_level_))) {
+        // This compaction output could potentially conflict with the output
+        // of a currently running compaction, we cannot run it.
+        return false;
+      }
+      compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+                                          output_level_inputs_,
+                                          &grandparents_);
+    }
+  } else {
+    compaction_inputs_.push_back(start_level_inputs_);
+  }
+  return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+  // Pick up the first file to start compaction. It may have been extended
+  // to a clean cut.
+  SetupInitialFiles();
+  if (start_level_inputs_.empty()) {
+    return nullptr;
+  }
+  assert(start_level_ >= 0 && output_level_ >= 0);
+
+  // If it is an L0 -> base level compaction, we need to set up other L0
+  // files if needed.
+  if (!SetupOtherL0FilesIfNeeded()) {
+    return nullptr;
+  }
+
+  // Pick files in the output level and expand more files in the start level
+  // if needed.
+  if (!SetupOtherInputsIfNeeded()) {
+    return nullptr;
+  }
+
+  // Form a compaction object containing the files we picked.
+  Compaction* c = GetCompaction();
+
+  TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+  return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+  auto c = new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(compaction_inputs_), output_level_,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+                          ioptions_.compaction_style, vstorage_->base_level(),
+                          ioptions_.level_compaction_dynamic_level_bytes),
+      mutable_cf_options_.max_compaction_bytes,
+      GetPathId(ioptions_, mutable_cf_options_, output_level_),
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
+                         vstorage_->base_level()),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+      /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
+      compaction_reason_);
+
+  // If it's level 0 compaction, make sure we don't execute any other level 0
+  // compactions in parallel
+  compaction_picker_->RegisterCompaction(c);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are
+  // already being compacted). Since we just changed compaction score, we
+  // recalculate it here
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  return c;
+}
+
+/*
+ * Find the optimal path to place a file
+ * Given a level, finds the path where levels up to it will fit in levels
+ * up to and including this path
+ */
+uint32_t LevelCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, int level) {
+  uint32_t p = 0;
+  assert(!ioptions.cf_paths.empty());
+
+  // size remaining in the most recent path
+  uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+  uint64_t level_size;
+  int cur_level = 0;
+
+  // max_bytes_for_level_base denotes L1 size.
+  // We estimate L0 size to be the same as L1.
+  level_size = mutable_cf_options.max_bytes_for_level_base;
+
+  // Last path is the fallback
+  while (p < ioptions.cf_paths.size() - 1) {
+    if (level_size <= current_path_size) {
+      if (cur_level == level) {
+        // Does desired level fit in this path?
+        return p;
+      } else {
+        current_path_size -= level_size;
+        if (cur_level > 0) {
+          if (ioptions.level_compaction_dynamic_level_bytes) {
+            // Currently, level_compaction_dynamic_level_bytes is ignored when
+            // multiple db paths are specified. https://github.com/facebook/
+            // rocksdb/blob/main/db/column_family.cc.
+            // Still, adding this check to avoid accidentally using
+            // max_bytes_for_level_multiplier_additional
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+          } else {
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+                mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+          }
+        }
+        cur_level++;
+        continue;
+      }
+    }
+    p++;
+    current_path_size = ioptions.cf_paths[p].target_size;
+  }
+  return p;
+}
+
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+  if (vstorage_->base_level() <= 0) {
+    return false;
+  }
+  if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+      !vstorage_->LevelFiles(output_level_).empty() &&
+      ioptions_.db_paths.size() <= 1) {
+    // Try to pick trivial move from L0 to L1. We start from the oldest
+    // file.
+    // We keep expanding to newer files if it would form a trivial move.
+    // For now we don't support it with
+    // mutable_cf_options_.compression_per_level, to simplify the logic
+    // of determining whether L0 can be trivially moved to the next level.
+    // We skip the case where the output level is empty, since in this case,
+    // at least the oldest file would qualify for trivial move, and this would
+    // be a surprising behavior with few benefits.
+
+    // We search from the oldest file to the newest. In theory, there may be
+    // files in the middle that can form trivial moves too, but it is probably
+    // uncommon and we ignore these cases for simplicity.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+
+    InternalKey my_smallest, my_largest;
+    for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+      CompactionInputFiles output_level_inputs;
+      output_level_inputs.level = output_level_;
+      FileMetaData* file = *it;
+      if (it == level_files.rbegin()) {
+        my_smallest = file->smallest;
+        my_largest = file->largest;
+      } else {
+        if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+            0) {
+          my_smallest = file->smallest;
+        } else if (compaction_picker_->icmp()->Compare(file->smallest,
+                                                       my_largest) > 0) {
+          my_largest = file->largest;
+        } else {
+          break;
+        }
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+                                      &output_level_inputs.files);
+      if (output_level_inputs.empty()) {
+        assert(!file->being_compacted);
+        start_level_inputs_.files.push_back(file);
+      } else {
+        break;
+      }
+    }
+  }
+
+  if (!start_level_inputs_.empty()) {
+    // Sort files by key range. Not sure it's 100% necessary but it's cleaner
+    // to always keep files sorted by key when the key ranges don't overlap.
+    std::sort(start_level_inputs_.files.begin(),
+              start_level_inputs_.files.end(),
+              [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+                                                  FileMetaData* f2) -> bool {
+                return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+              });
+
+    is_l0_trivial_move_ = true;
+    return true;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+  if (start_level_inputs_.size() == 1 &&
+      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+      (mutable_cf_options_.compression_per_level.empty())) {
+    // Only one file, the one at `start_index`, was picked, and it is likely a
+    // trivial move. Try to expand if it is still a trivial move, but not
+    // beyond max_compaction_bytes or 4 files, so that we don't create too
+    // much compaction pressure for the next level.
+    // Ignore if there is more than one DB path, as it would be hard
+    // to predict whether it is a trivial move.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+    const size_t kMaxMultiTrivialMove = 4;
+    FileMetaData* initial_file = start_level_inputs_.files[0];
+    size_t total_size = initial_file->fd.GetFileSize();
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    for (int i = start_index + 1;
+         i < static_cast<int>(level_files.size()) &&
+         start_level_inputs_.size() < kMaxMultiTrivialMove;
+         i++) {
+      FileMetaData* next_file = level_files[i];
+      if (next_file->being_compacted) {
+        break;
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+                                      &(next_file->largest),
+                                      &output_level_inputs.files);
+      if (!output_level_inputs.empty()) {
+        break;
+      }
+      if (i < static_cast<int>(level_files.size()) - 1 &&
+          compaction_picker_->icmp()
+                  ->user_comparator()
+                  ->CompareWithoutTimestamp(
+                      next_file->largest.user_key(),
+                      level_files[i + 1]->smallest.user_key()) == 0) {
+        TEST_SYNC_POINT_CALLBACK(
+            "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+            nullptr);
+        // Not a clean cut after adding the next file. Skip.
+        break;
+      }
+      total_size += next_file->fd.GetFileSize();
+      if (total_size > mutable_cf_options_.max_compaction_bytes) {
+        break;
+      }
+      start_level_inputs_.files.push_back(next_file);
+    }
+    return start_level_inputs_.size() > 1;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compaction at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (start_level_ == 0 &&
+      !compaction_picker_->level0_compactions_in_progress()->empty()) {
+    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+    return false;
+  }
+
+  start_level_inputs_.clear();
+  start_level_inputs_.level = start_level_;
+
+  assert(start_level_ >= 0);
+
+  if (TryPickL0TrivialMove()) {
+    return true;
+  }
+
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+
+  // Pick the file with the highest score in this level that is not already
+  // being compacted.
+  const std::vector<int>& file_scores =
+      vstorage_->FilesByCompactionPri(start_level_);
+
+  unsigned int cmp_idx;
+  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+       cmp_idx < file_scores.size(); cmp_idx++) {
+    int index = file_scores[cmp_idx];
+    auto* f = level_files[index];
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        // TODO(zichen): this file may be involved in one compaction from
+        // an upper level, cannot advance the cursor for round-robin policy.
+        // Currently, we do not pick any file to compact in this case. We
+        // should fix this later to ensure a compaction is picked but the
+        // cursor shall not be advanced.
+        return false;
+      }
+      continue;
+    }
+
+    start_level_inputs_.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &start_level_inputs_) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {start_level_inputs_}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // A locked (pending compaction) input-level file was pulled in due to
+      // user-key overlap.
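+      // (Illustrative example, not from the code: if the picked file spans
+      // user keys [a..c] and a neighbor spanning [c..e] is already being
+      // compacted, the clean-cut expansion must pull the neighbor in because
+      // both files contain user key c, so this candidate is abandoned and
+      // the scan moves on to the next file.)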
+      start_level_inputs_.clear();
+
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        return false;
+      }
+      continue;
+    }
+
+    // Now that input level is fully expanded, we check whether any output
+    // files are locked due to pending compaction.
+    //
+    // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+    // level files are locked, not just the extra ones pulled in for user-key
+    // overlap.
+    InternalKey smallest, largest;
+    compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (output_level_inputs.empty()) {
+      if (TryExtendNonL0TrivialMove(index)) {
+        break;
+      }
+    } else {
+      if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                      &output_level_inputs)) {
+        start_level_inputs_.clear();
+        if (ioptions_.compaction_pri == kRoundRobin) {
+          return false;
+        }
+        continue;
+      }
+    }
+
+    base_index_ = index;
+    break;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  if (ioptions_.compaction_pri != kRoundRobin) {
+    vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+  }
+  return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+  start_level_inputs_.clear();
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(0 /* level */);
+  if (level_files.size() <
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+      level_files[0]->being_compacted) {
+    // If L0 isn't accumulating many files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+    return false;
+  }
+  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+                               std::numeric_limits<uint64_t>::max(),
+                               mutable_cf_options_.max_compaction_bytes,
+                               &start_level_inputs_, earliest_mem_seqno_);
+}
+}  // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
+  LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+                                 log_buffer, mutable_cf_options, ioptions_,
+                                 mutable_db_options);
+  return builder.PickCompaction();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..42a9b60a6
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
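+//
+// Illustrative usage (editorial sketch, mirroring the unit tests in this
+// patch; `vstorage` and `log_buffer` are assumed to exist, and the variable
+// names are assumptions rather than upstream code):
+//
+//   Options options;
+//   ImmutableOptions ioptions(options);
+//   InternalKeyComparator icmp(options.comparator);
+//   LevelCompactionPicker picker(ioptions, &icmp);
+//   if (picker.NeedsCompaction(vstorage)) {
+//     Compaction* c = picker.PickCompaction(
+//         "default", MutableCFOptions(options), MutableDBOptions(),
+//         vstorage, &log_buffer);
+//   }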
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const ImmutableOptions& ioptions,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..2e2e566c0
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,3964 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/compaction/file_pri.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+  size_t log_count;
+};
+
+class CompactionPickerTestBase : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  MutableDBOptions mutable_db_options_;
+  LevelCompactionPicker level_compaction_picker;
+  std::string cf_name_;
+  CountingLogger logger_;
+  LogBuffer log_buffer_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::unique_ptr<VersionStorageInfo> vstorage_;
+  std::vector<std::unique_ptr<FileMetaData>> files_;
+  // does not own FileMetaData
+  std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+  // input files to compaction process.
+  std::vector<CompactionInputFiles> input_files_;
+  int compaction_level_start_;
+
+  explicit CompactionPickerTestBase(const Comparator* _ucmp)
+      : ucmp_(_ucmp),
+        icmp_(ucmp_),
+        options_(CreateOptions(ucmp_)),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        mutable_db_options_(),
+        level_compaction_picker(ioptions_, &icmp_),
+        cf_name_("dummy"),
+        log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+        file_num_(1),
+        vstorage_(nullptr) {
+    mutable_cf_options_.ttl = 0;
+    mutable_cf_options_.periodic_compaction_seconds = 0;
+    // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+    // tests to cover.
+    ioptions_.compaction_pri = kByCompensatedSize;
+    fifo_options_.max_table_files_size = 1;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    ioptions_.cf_paths.emplace_back("dummy",
+                                    std::numeric_limits<uint64_t>::max());
+  }
+
+  ~CompactionPickerTestBase() override {}
+
+  void NewVersionStorage(int num_levels, CompactionStyle style) {
+    DeleteVersionStorage();
+    options_.num_levels = num_levels;
+    vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+                                           style, nullptr, false));
+    vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+  }
+
+  // Create a new VersionStorageInfo object so we can add more files and then
+  // merge it with the existing VersionStorageInfo
+  void AddVersionStorage() {
+    temp_vstorage_.reset(new VersionStorageInfo(
+        &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
+        vstorage_.get(), false));
+  }
+
+  void DeleteVersionStorage() {
+    vstorage_.reset();
+    temp_vstorage_.reset();
+    files_.clear();
+    file_map_.clear();
+    input_files_.clear();
+  }
+
+  // REQUIRES: smallest and largest are c-style strings ending with '\0'
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           size_t compensated_file_size = 0, bool marked_for_compact = false,
+           Temperature temperature = Temperature::kUnknown,
+           uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
+           Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+    assert(ts_of_smallest.size() == ucmp_->timestamp_size());
+    assert(ts_of_largest.size() == ucmp_->timestamp_size());
+
+    VersionStorageInfo* vstorage;
+    if (temp_vstorage_) {
+      vstorage = temp_vstorage_.get();
+    } else {
+      vstorage = vstorage_.get();
+    }
+    assert(level < vstorage->num_levels());
+    char* smallest_key_buf = nullptr;
+    char* largest_key_buf = nullptr;
+
+    if (!ts_of_smallest.empty()) {
+      smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()];
+      memcpy(smallest_key_buf, smallest, strlen(smallest));
+      memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(),
+             ucmp_->timestamp_size());
+      largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()];
+      memcpy(largest_key_buf, largest, strlen(largest));
+      memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(),
+             ucmp_->timestamp_size());
+    }
+
+    InternalKey smallest_ikey = InternalKey(
+        smallest_key_buf ? Slice(smallest_key_buf,
+                                 ucmp_->timestamp_size() + strlen(smallest))
+                         : smallest,
+        smallest_seq, kTypeValue);
+    InternalKey largest_ikey = InternalKey(
+        largest_key_buf
+            ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest))
+            : largest,
+        largest_seq, kTypeValue);
+
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size, smallest_ikey, largest_ikey,
+        smallest_seq, largest_seq, marked_for_compact, temperature,
+        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size =
+        (compensated_file_size != 0) ?
+            compensated_file_size : file_size;
+    f->oldest_ancester_time = oldest_ancestor_time;
+    vstorage->AddFile(level, f);
+    files_.emplace_back(f);
+    file_map_.insert({file_number, {f, level}});
+
+    delete[] smallest_key_buf;
+    delete[] largest_key_buf;
+  }
+
+  void SetCompactionInputFilesLevels(int level_count, int start_level) {
+    input_files_.resize(level_count);
+    for (int i = 0; i < level_count; ++i) {
+      input_files_[i].level = start_level + i;
+    }
+    compaction_level_start_ = start_level;
+  }
+
+  void AddToCompactionFiles(uint32_t file_number) {
+    auto iter = file_map_.find(file_number);
+    assert(iter != file_map_.end());
+    int level = iter->second.second;
+    assert(level < vstorage_->num_levels());
+    input_files_[level - compaction_level_start_].files.emplace_back(
+        iter->second.first);
+  }
+
+  void UpdateVersionStorageInfo() {
+    if (temp_vstorage_) {
+      VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
+                             vstorage_.get(), nullptr);
+      ASSERT_OK(builder.SaveTo(temp_vstorage_.get()));
+      vstorage_ = std::move(temp_vstorage_);
+    }
+    vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->SetFinalized();
+  }
+
+ private:
+  Options CreateOptions(const Comparator* ucmp) const {
+    Options opts;
+    opts.comparator = ucmp;
+    return opts;
+  }
+
+  std::unique_ptr<VersionStorageInfo> temp_vstorage_;
+};
+
+class CompactionPickerTest : public CompactionPickerTestBase {
+ public:
+  explicit CompactionPickerTest()
+      : CompactionPickerTestBase(BytewiseComparator()) {}
+
+  ~CompactionPickerTest() override {}
+};
+
+class CompactionPickerU64TsTest : public CompactionPickerTestBase {
+ public:
+  explicit CompactionPickerU64TsTest()
+      : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
+
+  ~CompactionPickerU64TsTest() override {}
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  UpdateVersionStorageInfo();
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "p", "q");
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
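+  // (Editorial note: with RocksDB's default max_bytes_for_level_base of
+  // 256 MB, this single ~1 GB file gives L1 a compaction score of roughly
+  // 10^9 / (256 * 2^20) ~= 3.7, which is why the picker selects file 66.)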
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+  mutable_cf_options_.target_file_size_base = 10000000000;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000001U);
+  Add(1, 88U, "201", "300", 1000000000U);
+  Add(2, 6U, "150", "179", 1000000000U);
+  Add(2, 7U, "180", "220", 1000000000U);
+  Add(2, 8U, "221", "300", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  Add(0, 1U, "150", "200", 1000000U);
+  // Level 1 score 1.2
+  Add(1, 66U, "150", "200", 6000000U);
+  Add(1, 88U, "201", "300", 6000000U);
+  // Level 2 score 1.8. File 7 is the largest. Should be picked
+  Add(2, 6U, "150", "179", 60000000U);
+  Add(2, 7U, "180", "220", 60000001U);
+  Add(2, 8U, "221", "300", 60000000U);
+  // Level 3 score slightly larger than 1
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+                mutable_cf_options_.target_file_size_base / 10,
+            compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+  const int kLevels = 6;
+  const int kFileCount = 20;
+
+  for (int level = 0; level < kLevels - 1; ++level) {
+    NewVersionStorage(kLevels, kCompactionStyleLevel);
+    uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+    for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+      // start a brand new version in each test.
+      NewVersionStorage(kLevels, kCompactionStyleLevel);
+      for (int i = 0; i < file_count; ++i) {
+        Add(level, i, std::to_string((i + 100) * 1000).c_str(),
+            std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0,
+            i * 100, i * 100 + 99);
+      }
+      UpdateVersionStorageInfo();
+      ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+      ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+                vstorage_->CompactionScore(0) >= 1);
+      // release the version storage
+      DeleteVersionStorage();
+    }
+  }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1,
+            static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+  Add(num_levels - 3, 5U, "150", "180", 3U);
+  Add(num_levels - 3, 6U, "181", "300", 3U);
+  Add(num_levels - 3, 7U, "400", "450", 3U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(num_levels - 3, compaction->level(1));
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(num_levels - 1, 2U, "200", "250", 300U);
+  Add(num_levels - 1, 3U, "300", "350", 3000U);
+  Add(num_levels - 1, 4U, "400", "450", 3U);
+  Add(num_levels - 2, 5U, "150", "180", 300U);
+  Add(num_levels - 2, 6U, "181", "350", 500U);
+  Add(num_levels - 2, 7U, "400", "450", 200U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(0, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  // verify the trigger given different number of L0 files.
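+  // (Editorial note: the L0 score is `num L0 files` /
+  // `level0_file_num_compaction_trigger`, so with the default trigger of 4
+  // the score should cross 1.0 once the loop below has added the 4th file;
+  // earlier iterations should report no needed compaction.)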
+  for (int i = 1;
+       i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+    NewVersionStorage(1, kCompactionStyleUniversal);
+    Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+        std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+        i * 100 + 99);
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+  const uint64_t kFileSize = 100000;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  ioptions_.allow_ingest_behind = true;
+  ioptions_.num_levels = 3;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // output level should be the one above the bottom-most
+  ASSERT_EQ(1, compaction->output_level());
+}
+// Tests whether the files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test, as the input files overlap, they cannot
+// be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests whether the files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test, as the input files don't overlap, they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+  // The case where universal periodic compaction can be picked
+  // with some newer files being compacted.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not
+  // pick up a level to compact if it doesn't cover
+  // any file marked for periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not
+  // pick up only the last sorted run, which is an L0 file, if it isn't
+  // marked for periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction couldn't form
+  // a compaction that includes any file marked for periodic compaction.
+  // Right now we form the compaction anyway if it is more than one
+  // sorted run. Just put the case here to validate that it doesn't
+  // crash.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(!compaction ||
+              compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+  // Test single L0 file periodic compaction triggering.
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+  // Test single L0 file periodic compaction triggering.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+  // Test single sorted run non-L0 periodic compaction
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+  Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 555555;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "310", "380", kFileSize, 0, 200, 251);
+  Add(3, 6U, "410", "880", kFileSize, 0, 200, 251);
+  Add(3, 7U, "910", "980", 1, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+  // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+  // ASSERT_EQ(4U, compaction->num_input_files(1));
+  ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber());
+  ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 400000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(1, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(2, 5U, "310", "380", kFileSize, 0, 200, 251);
+  Add(2, 6U, "410", "880", kFileSize, 0, 200, 251);
+  Add(2, 7U, "910", "980", kFileSize, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(2, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
+  // Test bottom-level files falling into gaps between two upper-level
+  // files
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 300000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "000", "180", kFileSize, 0, 200, 251);
+  Add(3, 6U, "181", "190", kFileSize, 0, 200, 251);
+  Add(3, 7U, "710", "810", kFileSize, 0, 200, 251);
+  Add(3, 8U, "820", "830", kFileSize, 0, 200, 251);
+  Add(3, 9U, "900", "991", kFileSize, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(2, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(0, compaction->num_input_files(2));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
+  // Test compaction candidates always cover many files.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 3200000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like following:
+  // L3: (1101, 1180) (1201, 1280) ... (7901, 7980)
+  // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+  for (int i = 11; i < 79; i++) {
+    Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+        std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251);
+    // Add a tie breaker
+    if (i == 66) {
+      Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251);
+    }
+
+    Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+        std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+    Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+        std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+  }
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(6U, compaction->num_input_files(0));
+  ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber());
+  ASSERT_EQ(11, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
+  // Test compaction candidates always cover many files with some single
+  // files larger than size threshold.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 3200000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like following:
+  // L3: (1101, 1180) (1201, 1280) ... (7901, 7980)
+  // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+  for (int i = 11; i < 70; i++) {
+    Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+        std::to_string(i * 100 + 80).c_str(),
+        i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251);
+
+    Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+        std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+    Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+        std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+  }
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(6U, compaction->num_input_files(0));
+  ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+  ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const int kFileCount =
+      mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // Must return false when there are no files.
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+  // Verify whether compaction is needed based on the current
+  // size of L0 files.
+  for (int i = 1; i <= kFileCount; ++i) {
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+        std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+        i * 100 + 99);
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
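+
+// A minimal sketch (hypothetical helper, not the picker's actual code) of the
+// age check the FIFO-to-warm tests set up: with age_for_warm = 2000 and
+// threshold_time = now - 2000, only files whose oldest ancester time is below
+// threshold_time (files 3U and 4U above) are candidates for compaction to the
+// warm tier; newer files (5U, 6U) are left alone.
+namespace {
+inline bool IsWarmCandidate(uint64_t now_seconds, uint64_t age_for_warm,
+                            uint64_t file_oldest_ancester_time) {
+  // e.g. now = 1000000, age_for_warm = 2000 -> threshold = 998000; a file
+  // created at 997000 qualifies, one created at 999900 does not.
+  return now_seconds >= age_for_warm &&
+         file_oldest_ancester_time < now_seconds - age_for_warm;
+}
+}  // namespace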
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kUnknown, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  // Stop if a file is being compacted
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kWarm, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  // Pick up the not-yet-warm file sitting between the two warm files
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 100000000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+  Add(2, 6U, "150", "179", 50000000U);
+  Add(2, 7U, "180", "220", 50000000U);
+  Add(2, 8U, "321", "400", 50000000U);  // File not overlapping
+  Add(2, 9U, "721", "800", 50000000U);
+
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  Add(3, 30U, "750", "900", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Pick file 8 because it overlaps with 0 files on level 3.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+  // Compaction input size * 1.1
+  ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+  Add(2, 6U, "150", "175",
+      60000000U);  // Overlaps with file 26, 27, total size 521M
+  Add(2, 7U, "176", "200", 60000000U);  // Overlaps with file 27, 28, total
+                                        // size 520M, the smallest overlapping
+  Add(2, 8U, "201", "300",
+      60000000U);  // Overlaps with file 28, 29, total size 521M
+
+  Add(3, 25U, "100", "110", 261000000U);
+  Add(3, 26U, "150", "170", 261000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 261000000U);
+  Add(3, 30U, "321", "400", 261000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because its overlapping ratio is the smallest.
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single same-sized file in the next level,
+  // but file 8 itself is larger, so its overlapping ratio is smaller and it
+  // will be picked.
+  Add(2, 6U, "150", "167", 60000000U);  // Overlaps with file 26, 27
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28, but the file
+                                        // itself is larger. Should be picked.
+
+  Add(3, 26U, "160", "165", 260000000U);
+  Add(3, 27U, "166", "170", 260000000U);
+  Add(3, 28U, "180", "400", 260000000U);
+  Add(3, 29U, "401", "500", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
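+
+// Worked example of the kMinOverlappingRatio arithmetic checked above, using
+// the numbers from CompactionPriMinOverlapping3 (illustrative only): the
+// picker prefers the input file with the smallest ratio of (overlapping
+// bytes in the output level) / (file size):
+//   file 6: 520000000 / 60000000 ~= 8.67  (overlaps files 26 and 27)
+//   file 7: 260000000 / 60000000 ~= 4.33  (overlaps file 27)
+//   file 8: 260000000 / 61000000 ~= 4.26  (overlaps file 28) <- picked
+// File 8 wins because the same overlap is amortized over a larger input.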
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+
+  // File 6 overlaps with files 26 and 27, but its compensated file size
+  // (180M) lowers its compensated overlapping ratio, so it will be picked.
+  Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28
+
+  Add(3, 26U, "160", "165", 60000000U);
+  // Boosted file size in output level is not considered.
+  Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+  Add(3, 28U, "180", "400", 60000000U);
+  Add(3, 29U, "401", "500", 60000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 6 because its compensated overlapping ratio is the smallest.
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
+  std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue),
+                                           InternalKey("600", 100, kTypeValue),
+                                           InternalKey()};
+  std::vector<uint32_t> selected_files = {8U, 6U, 6U};
+
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_bytes_for_level_base = 12000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  for (size_t i = 0; i < test_cursors.size(); i++) {
+    // Start a brand new version in each test.
+    NewVersionStorage(6, kCompactionStyleLevel);
+    vstorage_->ResizeCompactCursors(6);
+    // Set the cursor
+    vstorage_->AddCursorForOneLevel(2, test_cursors[i]);
+    Add(2, 6U, "150", "199", 50000000U);  // Overlap with 26U, 27U
+    Add(2, 7U, "200", "249", 50000000U);  // File not overlapping
+    Add(2, 8U, "300", "600", 50000000U);  // Overlap with 28U, 29U
+
+    Add(3, 26U, "130", "165", 60000000U);
+    Add(3, 27U, "166", "170", 60000000U);
+    Add(3, 28U, "270", "340", 60000000U);
+    Add(3, 29U, "401", "500", 60000000U);
+    UpdateVersionStorageInfo();
+    LevelCompactionPicker local_level_compaction_picker =
+        LevelCompactionPicker(ioptions_, &icmp_);
+    std::unique_ptr<Compaction> compaction(
+        local_level_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_,
+            vstorage_.get(), &log_buffer_));
+    ASSERT_TRUE(compaction.get() != nullptr);
+    // Since the max bytes for level 2 is 120M, picking one file to compact
+    // makes the post-compaction level size less than 120M, so exactly one
+    // file is picked for round-robin compaction
+    ASSERT_EQ(1U, compaction->num_input_files(0));
+    ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber());
+    // Release the version storage
+    DeleteVersionStorage();
+  }
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 100000000u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // Start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 7U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);
+  Add(2, 7U, "200", "249", 500U);
+  Add(2, 8U, "300", "600", 500U);
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "170", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is very large in this case, so we can
+  // ignore its constraint. The maximum bytes for level 2 is 1200 bytes,
+  // so at least 3 files must be picked to bring the level-2 size below
+  // that maximum
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber());
+  // Release the version storage
+  DeleteVersionStorage();
+}
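+
+// Worked example of the byte budgets the two round-robin tests around here
+// rely on (illustrative arithmetic only): level 2 holds five 500-byte files
+// (2500 bytes total) against a target of max_bytes_for_level_base *
+// max_bytes_for_level_multiplier = 120 * 10 = 1200 bytes, so at least three
+// files must be removed (2500 - 3 * 500 = 1000 <= 1200). With
+// max_compaction_bytes = 2500, however, files 6U and 7U plus their
+// overlapping level-3 files (500 + 500 + 600 + 600 = 2200 bytes) already
+// nearly exhaust the budget, so only two input files can be picked.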
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 2500u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // Start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 6U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);  // Overlap with 26U, 27U
+  Add(2, 7U, "200", "249", 500U);  // Overlap with 27U
+  Add(2, 8U, "300", "600", 500U);  // Overlap with 28U, 29U
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "230", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is only 2500 now. Even though 3 files would
+  // have to be chosen to bring the post-compaction level size below 1200
+  // bytes, we cannot pick 3 files to compact since the maximum compaction
+  // size is 2500. After picking files 6U and 7U, the number of compaction
+  // bytes has reached 2200, leaving no room to add another 500-byte input
+  // file together with its overlapping files.
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber());
+  // Release the version storage
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // Start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 9U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);
+  Add(2, 7U, "200", "249", 500U);
+  Add(2, 8U, "300", "600", 500U);
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "170", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // Cannot pick more files since we reach the last file in level 2
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber());
+  // Release the version storage
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 15000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single file in the next level, but file 8
+  // itself is larger, so its overlapping ratio is smaller and it will be
+  // picked.
+  Add(2, 13U, "010", "011",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 14U, "020", "021",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 15U, "030", "031",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 16U, "040", "041",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 17U, "050", "051",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 18U, "060", "061",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 19U, "070", "071",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 20U, "080", "081",
+      6100U);  // Overlaps with a large file. Not picked
+
+  Add(2, 6U, "150", "167", 60000000U);  // Overlaps with file 26, 27
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28, but the file
+                                        // itself is larger. Should be picked.
+  Add(2, 9U, "610", "611",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 10U, "620", "621",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 11U, "630", "631",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 12U, "640", "641",
+      6100U);  // Overlaps with a large file. Not picked
+
+  Add(3, 31U, "001", "100", 260000000U);
+  Add(3, 26U, "160", "165", 260000000U);
+  Add(3, 27U, "166", "170", 260000000U);
+  Add(3, 28U, "180", "400", 260000000U);
+  Add(3, 29U, "401", "500", 260000000U);
+  Add(3, 30U, "601", "700", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+  int num_levels = ioptions_.num_levels;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");       // <- marked for compaction
+  Add(1, 3U, "400", "500", 600);  // <- this one needs compacting
+  Add(2, 4U, "150", "200");
+  Add(2, 5U, "201", "210");
+  Add(2, 6U, "300", "310");
+  Add(2, 7U, "400", "500");  // <- being compacted
+
+  vstorage_->LevelFiles(2)[3]->being_compacted = true;
+  vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user key
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kByCompensatedSize;
+
+  Add(1, 1U, "100", "150", 1U);
+  // Overlapping user keys
+  Add(1, 2U, "200", "400", 1U);
+  Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+  Add(2, 4U, "600", "700", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
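+
+// Why the expansion exercised by these tests happens (a sketch, assuming
+// only what the tests above set up; the helper below is hypothetical, not
+// the picker API): adjacent files may hold entries for the same user key at
+// different sequence numbers, modeled here with touching key ranges such as
+// "200".."400" and "400".."500". Compacting only one of them could reorder
+// versions of the boundary key, so all files sharing a boundary user key
+// must be compacted together.
+namespace {
+inline bool MustCompactTogether(const std::string& f1_largest_user_key,
+                                const std::string& f2_smallest_user_key) {
+  // e.g. F1 = ["200".."400"] and F2 = ["400".."500"] share user key "400".
+  return f1_largest_user_key == f2_smallest_user_key;
+}
+}  // namespace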
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+  // expand multiple times)
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "200", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+  Add(1, 4U, "250", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "199", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1100000U, 0, 0);
+  Add(1, 4U, "251", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+
+  Add(2, 6U, "100", "115", 1U);
+  Add(2, 7U, "125", "325", 1U);
+  Add(2, 8U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "300", 1U, 0, 0);
+  Add(2, 5U, "305", "450", 1U, 0, 0);
+  Add(2, 6U, "460", "600", 1U, 0, 0);
+  Add(2, 7U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+  vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+  Add(2, 3U, "100", "250", 1U);
+  Add(2, 4U, "300", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "800", 1U, 0, 0);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_GE(1U, compaction->num_input_files(0));
+  ASSERT_GE(2U, compaction->num_input_files(1));
+  // File 5 has to be included in the compaction
+  ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // Grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up.
+  // Expand the input level as much as possible;
+  // no overlapping case
+  Add(1, 1U, "101", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "150", "200", 1U);
+  Add(2, 7U, "200", "450", 1U, 0, 0);
+  Add(2, 8U, "500", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // Grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up.
+  // Expand the input level as much as possible;
+  // overlapping case
+  Add(1, 1U, "121", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "100", "120", 1U);
+  Add(2, 7U, "150", "200", 1U);
+  Add(2, 8U, "200", "450", 1U, 0, 0);
+  Add(2, 9U, "501", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+  // Locked file encountered when pulling in extra input-level files with same
+  // user keys. Verify we pick the next-best file from the same input level.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // file_number 2U is largest and thus first choice. But it overlaps with
+  // file_number 1U which is being compacted. So instead we pick the next-
+  // biggest file, 3U, which is eligible for compaction.
+  Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  file_map_[1U].first->being_compacted = true;
+  Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 900000000U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+  // Locked file encountered when pulling in extra output-level files with same
+  // user keys. Expected to skip that compaction and pick the next-best choice.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // score(L1) = 3.7
+  // score(L2) = 1.85
+  // There is no eligible file in L1 to compact since both candidates pull in
+  // file_number 5U, which overlaps with a file pending compaction (6U). The
+  // first eligible compaction is from L2->L3.
+  Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 5000000000U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+      "201" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  file_map_[6U].first->being_compacted = true;
+  Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FileTtlBooster) {
+  // Set TTL to 2048
+  // TTL boosting for all levels starts at 1024,
+  // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960.
+  // Starting from the second-last level (L5), each level's boost range
+  // starts at 1024 + 480 (L5), 1024 + 240 (L4), 1024 + 120 (L3).
+  // Boosting step 120 / 16 = 7.5 -> 7
+  //
+  const uint64_t kCurrentTime = 1000000;
+  FileMetaData meta;
+
+  {
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 3);
+
+    // Not triggering if the file is younger than ttl/2
+    meta.oldest_ancester_time = kCurrentTime - 1023;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - 1024;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime + 10;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+    // Within one boosting step
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+    // One boosting step
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7);
+    ASSERT_EQ(2, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8);
+    ASSERT_EQ(2, booster.GetBoostScore(&meta));
+
+    // Multiple boosting steps
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30);
+    ASSERT_EQ(5, booster.GetBoostScore(&meta));
+
+    // Very high boosting steps
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700);
+    ASSERT_EQ(101, booster.GetBoostScore(&meta));
+  }
+  {
+    // Test second last level
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 5);
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+    ASSERT_EQ(3, booster.GetBoostScore(&meta));
+  }
+  {
+    // Test last level
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 6);
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - 3000;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+  }
+}
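+
+// Worked example of the boost arithmetic the asserts above check
+// (illustrative only, derived from the test's own constants): for L3 the
+// boost window starts at age 1024 + 120 = 1144 with step 7, and the score is
+// roughly 1 + (age - window_start) / step, using integer division:
+//   age 1144 + 6   -> 1 + 6 / 7   = 1
+//   age 1144 + 7   -> 1 + 7 / 7   = 2
+//   age 1144 + 30  -> 1 + 30 / 7  = 5
+//   age 1144 + 700 -> 1 + 700 / 7 = 101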
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // No compaction should be scheduled: L0 has higher priority than L1,
+  // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If no file in L1 is being compacted, the L0->L1 compaction will be
+  // scheduled.
+  UpdateVersionStorageInfo();  // being_compacted flag is cleared here.
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 score more than 6.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+  Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If the score of L1 is larger than L0's, the L1 compaction goes through
+  // even though there is a pending L0 compaction.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 3U, "150", "200", 200);
+  // Level 1 is over target by 200
+  Add(1, 4U, "400", "500", 600);
+  Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than target 10000 even after adding the size of level 1
+  // Size ratio of L2/L1 is 9600 / 1200 = 8
+  Add(2, 6U, "150", "200", 2500);
+  Add(2, 7U, "201", "210", 2000);
+  Add(2, 8U, "300", "310", 2600);
+  Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds target 100,000 by 1,000
+  Add(3, 10U, "400", "500", 101000);
+  // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+  // Size ratio L4/L3 is 9.9
+  // After merge from L3, L4 size is 1000900
+  Add(4, 11U, "400", "500", 999900);
+  Add(5, 12U, "400", "500", 8007200);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+ Add(3, 10U, "400", "500", 101000); + + UpdateVersionStorageInfo(); + + // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0) + ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 2000); + Add(0, 2U, "150", "200", 2000); + Add(0, 4U, "150", "200", 2000); + Add(0, 5U, "150", "200", 2000); + Add(0, 6U, "150", "200", 1000); + // Level 1 size will be 10000 after merging with L0 + Add(1, 7U, "400", "500", 500); + Add(1, 8U, "600", "700", 500); + + Add(2, 9U, "150", "200", 10000); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + // Set Last level size 50000 + // num_levels - 1 target 5000 + // num_levels - 2 is base level with target 1000 (rounded up to + // max_bytes_for_level_base). + Add(num_levels - 1, 10U, "400", "500", 50000); + + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 4U, "150", "200", 200); + Add(0, 5U, "150", "200", 200); + Add(0, 6U, "150", "200", 200); + // num_levels - 3 is over target by 100 + 1000 + Add(num_levels - 3, 7U, "400", "500", 550); + Add(num_levels - 3, 8U, "600", "700", 550); + // num_levels - 2 is over target by 1100 + 200 + Add(num_levels - 2, 9U, "150", "200", 5200); + + UpdateVersionStorageInfo(); + + // Merging to the second last level: (5200 / 2100 + 1) * 1100 + // Merging to the last level: (50000 / 6300 + 1) * 1300 + ASSERT_EQ(2100u + 3823u + 11617u, + vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, IsBottommostLevelTest) { + // case 1: Higher levels are empty + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + bool result = + Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 2: Higher levels have no overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "k", "p"); + Add(3, 8U, "t", "w"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 3.1: Higher levels (level 3) have overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + 
Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "e", "g"); + Add(3, 8U, "h", "k"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.2: Higher levels (level 5) have overlap + DeleteVersionStorage(); + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "h", "k"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping + // one key ("d") + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "ccc", "d"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "z"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files don't overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // Level 1 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(1, 5U, "a", "m"); + Add(1, 6U, "n", "o"); + Add(1, 7U, "w", "y"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + AddToCompactionFiles(5U); + AddToCompactionFiles(6U); + AddToCompactionFiles(7U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { + mutable_cf_options_.max_bytes_for_level_base = 1000000u; + 
mutable_cf_options_.max_compaction_bytes = 800000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 and 5. + // It can expand because adding file 1 and 3, the compaction size will + // exceed mutable_cf_options_.max_bytes_for_level_base. + Add(1, 1U, "100", "150", 300000U); + Add(1, 2U, "151", "200", 300001U, 0, 0); + Add(1, 3U, "201", "250", 300000U, 0, 0); + Add(1, 4U, "251", "300", 300000U, 0, 0); + Add(2, 5U, "100", "256", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { + mutable_cf_options_.max_bytes_for_level_base = 800000u; + mutable_cf_options_.max_compaction_bytes = 1000000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 and 5. + // and it expands to file 1 and 3 too. + Add(1, 1U, "100", "150", 300000U); + Add(1, 2U, "151", "200", 300001U, 0, 0); + Add(1, 3U, "201", "250", 300000U, 0, 0); + Add(1, 4U, "251", "300", 300000U, 0, 0); + Add(2, 5U, "000", "251", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, IsTrivialMoveOn) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "100", "150", 3000U); + Add(1, 2U, "151", "200", 3001U); + Add(1, 3U, "201", "250", 3000U); + Add(1, 4U, "251", "300", 3000U); + + Add(3, 5U, "120", "130", 7000U); + Add(3, 6U, "170", "180", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMove1) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + 
NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "100", "150", 3000U, 0, 710, 800); + Add(0, 2U, "151", "200", 3001U, 0, 610, 700); + Add(0, 3U, "301", "350", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "170", "180", 7000U); + Add(1, 7U, "220", "230", 7000U); + Add(1, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "100", "150", 3000U, 0, 710, 800); + Add(0, 2U, "551", "600", 3001U, 0, 610, 700); + Add(0, 3U, "101", "150", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "170", "180", 7000U); + Add(1, 7U, "220", "230", 7000U); + Add(1, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "300", "350", 3000U, 0, 710, 800); + Add(0, 2U, "651", "600", 3001U, 0, 610, 700); + Add(0, 3U, "501", "550", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "970", "980", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(4, compaction->num_input_files(0)); + ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1); + NewVersionStorage(6, kCompactionStyleLevel); + // 
A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // No trivial move, because partitioning is applied
+ ASSERT_TRUE(!compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 3U, "161", "179", 3000U);
+ Add(2, 4U, "220", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U,
"220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // Even if consecutive files can be trivial moved, we don't pick them + // since in case trivial move can't be issued for a reason, we cannot + // fall back to normal compactions. + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "160", 3001U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // File 4 and 5 aren't clean cut, so only 2 and 3 are picked. 
+ Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + Add(2, 4U, "180", "185", 4000U); + Add(2, 5U, "185", "190", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "200", 3001U); + Add(2, 3U, "301", "350", 3000U); + Add(2, 4U, "451", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + file_map_[5U].first->being_compacted = true; + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 16U, "170", "180", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Since the next file is being compacted. Stopping at 3 and 4. 
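+ // (File 5 was flagged being_compacted above, so the run of consecutive
+ // movable files starting at file 3 is cut off there; only files 3 and 4
+ // are picked.)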
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 100U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "200" /* smallest */,
+ "240" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 7U /* file_number */, "260" /* smallest */,
+ "270" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra L0 compaction. The one L1 file
+ // spans entire L0 key range and is marked as being compacted to avoid
+ // L0->L1 compaction.
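+ // (Trigger arithmetic: intra-L0 needs level0_file_num_compaction_trigger
+ // + 2 = 5 files, and the five 200000-byte files below total exactly
+ // max_compaction_bytes = 1000000, so the byte limit is not exceeded.)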
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
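+ // (The picker is invoked below with an earliest seqno of 107: file 2
+ // carries seqnos 108..109, above the cutoff, so it is skipped, and file 7
+ // is itself already being compacted, which leaves the four files 3..6.)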
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a "regular" universal compaction is + // scheduled first, followed by a delete triggered compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a compaction to reduce sorted runs + ASSERT_EQ(CompactionReason::kUniversalSortedRunNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + + AddVersionStorage(); + // Simulate a flush and mark the file for compaction + Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. 
The latter
+ // should fail
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+
+ bool input_level_overlap = false;
+ bool output_level_overlap = false;
+ // Let's mark 2 files in 2 different levels for compaction. The
+ // compaction picker will randomly pick one, so use the sync point to
+ // ensure a deterministic order. Loop until both cases are covered
+ size_t random_index = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) {
+ size_t* index = static_cast<size_t*>(arg);
+ *index = random_index;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (!input_level_overlap || !output_level_overlap) {
+ // Ensure that the L0 file gets picked first
+ random_index = !input_level_overlap ? 0 : 1;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248);
+ Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249);
+ Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250);
+ Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true);
+ Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_TRUE(compaction->start_level() == 0 ||
+ compaction->start_level() == 3);
+ if (compaction->start_level() == 0) {
+ // The L0 file was picked. The next compaction will detect an
+ // overlap on its input level
+ input_level_overlap = true;
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ } else {
+ // The level 3 file was picked. The next compaction will pick
+ // the L0 file and will detect overlap when adding output
+ // level inputs
+ output_level_overlap = true;
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ }
+
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // After recomputing the compaction score, only one marked file will remain
+ random_index = 0;
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled and should result in a full compaction
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a file is being compacted, and a
+ // delete triggered compaction is then scheduled. The latter should stop
+ // at the first file being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+ file_map_[3].first->being_compacted = true;
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+ // should still be picked, limited to files not already being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 5 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction2);
+ ASSERT_EQ(3U, compaction2->num_input_files(0));
+ ASSERT_TRUE(file_map_[1].first->being_compacted);
+ ASSERT_TRUE(file_map_[2].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+
+ // This test makes sure the `files_marked_for_compaction_` is updated after
+ // creating manual compaction.
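+ // ("Marked" refers to FileMetaData::marked_for_compaction; the Add()
+ // helper sets it through its trailing marked_for_compact argument, as in
+ // the three calls below.)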
+ ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + + // Add 3 files marked for compaction + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + UpdateVersionStorageInfo(); + + // All 3 files are marked for compaction + ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size()); + + bool manual_conflict = false; + InternalKey* manual_end = nullptr; + std::unique_ptr compaction( + universal_compaction_picker.CompactRange( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(), + nullptr, nullptr, &manual_end, &manual_conflict, + std::numeric_limits::max(), "")); + + ASSERT_TRUE(compaction); + + ASSERT_EQ(CompactionReason::kManualCompaction, + compaction->compaction_reason()); + ASSERT_EQ(kNumLevels - 1, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + + // After creating the manual compaction, all files should be cleared from + // `FilesMarkedForCompaction`. So they won't be picked by others. + ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size()); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { + // This test make sure size amplification compaction could still be triggered + // if the last sorted run is not the last level. + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(4, 90U, "100", "600", 4 * kFileSize); + Add(5, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Make sure it's a size amp compaction and includes all files + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(4)->num_files, 1); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) { + // This test makes sure the size amp calculation skips the last level (L6), so + // size amp compaction is not triggered, instead a size ratio compaction is + // triggered. 
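+ // (Sketch of the arithmetic, assuming size amp compares the older sorted
+ // runs against the reference run: with the last level skipped, the two L0
+ // files measure (1 + 1) * kFileSize against the 4 * kFileSize run on L5,
+ // i.e. 50%, which is below the 200% threshold, so only size ratio fires.)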
+ const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(5, 90U, "100", "600", 4 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Internally, size amp compaction is evaluated before size ratio compaction. + // Here to make sure it's size ratio compaction instead of size amp + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeRatio); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 0); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { + // Tiered compaction only support level_num > 2 (otherwise the penultimate + // level is going to be level 0, which may make thing more complicated), so + // when there's only 2 level, still treating level 1 as the last level for + // size amp compaction + const uint64_t kFileSize = 100000; + const int kNumLevels = 2; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(0, 90U, "100", "600", 4 * kFileSize); + Add(1, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // size amp compaction is still triggered even preclude_last_level is set + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 3); + ASSERT_EQ(compaction->input_levels(1)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { + // This test makes sure the size amp compaction for tiered storage could still + // be triggered, but only for non-last-level files + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + 
Add(0, 100U, "100", "300", 3 * kFileSize); + Add(0, 101U, "200", "400", 2 * kFileSize); + Add(5, 90U, "100", "600", 2 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // It's a Size Amp compaction, but doesn't include the last level file and + // output to the penultimate level. + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + +TEST_F(CompactionPickerU64TsTest, Overlap) { + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + constexpr int level = 0; + constexpr uint64_t file_number = 20ULL; + constexpr char smallest[4] = "500"; + constexpr char largest[4] = "600"; + constexpr uint64_t ts_of_smallest = 12345ULL; + constexpr uint64_t ts_of_largest = 56789ULL; + + { + std::string ts1; + PutFixed64(&ts1, ts_of_smallest); + std::string ts2; + PutFixed64(&ts2, ts_of_largest); + Add(level, file_number, smallest, largest, + /*file_size=*/1U, /*path_id=*/0, + /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0, + /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown, + /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2); + UpdateVersionStorageInfo(); + } + + std::unordered_set input{file_number}; + + std::vector input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input, vstorage_.get(), CompactionOptions())); + std::unique_ptr comp1(level_compaction_picker.CompactFiles( + CompactionOptions(), input_files, level, vstorage_.get(), + mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0)); + + { + // [600, ts=50000] to [600, ts=50000] is the range to check. + // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but + // ucmp->CompareWithoutTimestamp(smallest_user_key, + // c->GetLargestUserKey()) == 0. + // Should still be considered overlapping. + std::string user_key_with_ts1(largest); + PutFixed64(&user_key_with_ts1, ts_of_largest - 1); + std::string user_key_with_ts2(largest); + PutFixed64(&user_key_with_ts2, ts_of_largest - 1); + ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction( + user_key_with_ts1, user_key_with_ts2, level)); + } + { + // [500, ts=60000] to [500, ts=60000] is the range to check. + // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but + // ucmp->CompareWithoutTimestamp(largest_user_key, + // c->GetSmallestUserKey()) == 0. + // Should still be considered overlapping. 
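+ // (Both probes below reuse a file boundary user key and vary only the
+ // appended 64-bit timestamp, so CompareWithoutTimestamp() sees equality
+ // even though the timestamp-aware Compare() places them outside the
+ // file's [smallest, largest] range.)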
+ std::string user_key_with_ts1(smallest); + PutFixed64(&user_key_with_ts1, ts_of_smallest + 1); + std::string user_key_with_ts2(smallest); + PutFixed64(&user_key_with_ts2, ts_of_smallest + 1); + ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction( + user_key_with_ts1, user_key_with_ts2, level)); + } +} + +TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) { + constexpr uint64_t kFileSize = 100000; + + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get())); + + std::string ts1; + PutFixed64(&ts1, 9000); + std::string ts2; + PutFixed64(&ts2, 8000); + std::string ts3; + PutFixed64(&ts3, 7000); + std::string ts4; + PutFixed64(&ts4, 6000); + + NewVersionStorage(3, kCompactionStyleUniversal); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/100, /*compensated_file_size=*/kFileSize, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, ts1, ts2); + Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/100, /*compensated_file_size=*/kFileSize, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, ts3, ts4); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + assert(compaction); + ASSERT_TRUE(!compaction->is_trivial_move()); +} + +class PerKeyPlacementCompactionPickerTest + : public CompactionPickerTest, + public testing::WithParamInterface { + public: + PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {} + + void SetUp() override { enable_per_key_placement_ = GetParam(); } + + protected: + bool enable_per_key_placement_ = false; +}; + +TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(5, 40U, "200", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set input_set; + input_set.insert(40); + std::vector input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr comp1(level_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + 
level_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, + 0, 6))); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set input_set; + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + std::vector input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr comp1(level_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(40); + input_set.insert(41); + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + level_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, + OverlapWithNormalCompactionUniveral) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(5, 40U, "200", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set input_set; + input_set.insert(40); + std::vector input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, + 0, 6))); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + 
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+  // This test makes sure tiered compaction locks the whole key range of
+  // both the output level and the penultimate level.
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4: [200, 220] [230, 250] [360, 380]
+  // L5:
+  // L6: [101, 351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // The existing compaction is the 1st L4 file + the L6 file; a compaction
+  // of the 2nd L4 file to L5 (the penultimate level) then overlaps with it
+  // when the tiered compaction feature is on.
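+  // Concretely: the compaction created below covers [101, 351] (file 40 +
+  // file 60), and with per-key placement enabled its output may also land
+  // on L5. File 41 [230, 250] falls inside that range, so compacting it to
+  // L5 must be blocked, while file 42 [360, 380] lies outside the range
+  // and remains safe.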
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+
+  // compacting the 3rd L4 file is always safe:
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(42);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4: [200, 220] [230, 250] [360, 380]
+  // L5:
+  // L6: [101, 351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  // File 41 cannot be compacted if the preclude_last_level feature is on;
+  // otherwise compacting file 41 is okay.
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+
+  // compacting the 3rd L4 file is always safe:
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(42);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyFailPenultimateUniversal) {
+  // This tests that a last-level-only compaction is still unable to output
+  // to the penultimate level if there is already a file in the penultimate
+  // level.
+  // This should rarely happen in universal compaction, as a non-empty L5
+  // should be included in the compaction.
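+  // (Universal compaction normally picks contiguous sorted runs, which is
+  // why a last-level-only compaction with a non-empty L5 is unusual; this
+  // test constructs the situation directly via CompactFiles().)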
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4: [200, 220]
+  // L5: [230, 250]
+  // L6: [101, 351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(5, 50U, "230", "250", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  // As comp1 cannot output to the penultimate level, compacting file 40 to
+  // L5 is always safe.
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+
+  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+  ASSERT_TRUE(comp2);
+  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyConflictWithOngoingUniversal) {
+  // This tests that a last-level-only compaction is still unable to output
+  // to the penultimate level if there is already an ongoing compaction to
+  // the penultimate level.
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4: [200, 220] [230, 250] [360, 380]
+  // L5:
+  // L6: [101, 351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // create an ongoing compaction to L5 (penultimate level)
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(60);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 6,
+                Compaction::EvaluatePenultimateLevel(vstorage_.get(),
+                                                     ioptions_, 6, 6)));
+
+  if (!enable_per_key_placement_) {
+    std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+        comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+        mutable_db_options_, 0));
+    ASSERT_TRUE(comp2);
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  }
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyNoConflictWithOngoingUniversal) {
+  // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`; the only
+  // difference is that the ongoing compaction to L5 has no overlap with the
+  // last-level compaction, so it is safe to move data from the last level up
+  // to the penultimate level.
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4: [200, 220] [230, 250] [360, 380]
+  // L5:
+  // L6: [101, 351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // create an ongoing compaction to L5 (penultimate level)
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(42);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(60);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  // always safe to move data up
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 6,
+      Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6)));
+
+  // the two compactions can run in parallel
+  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+  ASSERT_TRUE(comp2);
+  if (enable_per_key_placement_) {
+    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  } else {
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
+                        PerKeyPlacementCompactionPickerTest,
+                        ::testing::Bool());
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..376e4f60f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1450 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// The usage is to create the class, and get the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+  UniversalCompactionBuilder(
+      const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+      : ioptions_(ioptions),
+        icmp_(icmp),
+        cf_name_(cf_name),
+        mutable_cf_options_(mutable_cf_options),
+        mutable_db_options_(mutable_db_options),
+        vstorage_(vstorage),
+        picker_(picker),
+        log_buffer_(log_buffer) {}
+
+  // Form and return the compaction object. The caller owns the returned
+  // object.
+  Compaction* PickCompaction();
+
+ private:
+  struct SortedRun {
+    SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+              uint64_t _compensated_file_size, bool _being_compacted)
+        : level(_level),
+          file(_file),
+          size(_size),
+          compensated_file_size(_compensated_file_size),
+          being_compacted(_being_compacted) {
+      assert(compensated_file_size > 0);
+      assert(level != 0 || file != nullptr);
+    }
+
+    void Dump(char* out_buf, size_t out_buf_size,
+              bool print_path = false) const;
+
+    // sorted_run_count is added into the string to print
+    void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+                      size_t sorted_run_count) const;
+
+    int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run is
+    // for this file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums of the
+    // sizes of all files in the level. `being_compacted` should be the same
+    // for all files in a non-zero level. Use the value here.
+    uint64_t size;
+    uint64_t compensated_file_size;
+    bool being_compacted;
+  };
+
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionToReduceSortedRuns(
+      unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionToReduceSizeAmp();
+
+  // Try to pick incremental compaction to reduce space amplification.
+  // It will return null if it cannot find a fanout within the threshold.
+  // Fanout is defined as
+  //    total size of files to compact at output level
+  //    --------------------------------------------------
+  //    total size of files to compact at other levels
+  Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold);
+
+  Compaction* PickDeleteTriggeredCompaction();
+
+  // Form a compaction from the sorted run indicated by start_index to the
+  // oldest sorted run.
+  // The caller is responsible for making sure that those files are not in
+  // compaction.
+  Compaction* PickCompactionToOldest(size_t start_index,
+                                     CompactionReason compaction_reason);
+
+  Compaction* PickCompactionWithSortedRunRange(
+      size_t start_index, size_t end_index,
+      CompactionReason compaction_reason);
+
+  // Try to pick periodic compaction. The caller should only call it
+  // if there is at least one file marked for periodic compaction.
+  // null will be returned if no such compaction can be formed
+  // because some files are being compacted.
+  Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move
+  // option is set. Checks whether there are any overlapping files
+  // in the input. Returns true if the input files are
+  // non-overlapping.
+  bool IsInputFilesNonOverlapping(Compaction* c);
+
+  uint64_t GetMaxOverlappingBytes() const;
+
+  const ImmutableOptions& ioptions_;
+  const InternalKeyComparator* icmp_;
+  double score_;
+  std::vector<SortedRun> sorted_runs_;
+  const std::string& cf_name_;
+  const MutableCFOptions& mutable_cf_options_;
+  const MutableDBOptions& mutable_db_options_;
+  VersionStorageInfo* vstorage_;
+  UniversalCompactionPicker* picker_;
+  LogBuffer* log_buffer_;
+
+  static std::vector<SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage);
+
+  // Pick a path ID to place a newly generated file, with its estimated file
+  // size.
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min heap
+// that contains the file meta data, the level of the file
+// and the index of the file in that level.
+struct InputFileInfo {
+  InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+  FileMetaData* f;
+  size_t level;
+  size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+  explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+    return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+                                           i2.f->smallest.user_key()) > 0);
+  }
+
+ private:
+  const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+    std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+                        SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to find if the files are
+// overlapping during universal compaction when the allow_trivial_move
+// is set.
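+// For example, if the input files have smallest keys "a", "c" and "b", the
+// heap pops them in the order "a", "b", "c"; IsInputFilesNonOverlapping()
+// then only needs to compare each popped file's smallest key against the
+// previously popped file's largest key to detect overlap.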
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+  SmallestKeyHeap smallest_key_priority_q =
+      SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+  InputFileInfo input_file;
+
+  for (size_t l = 0; l < c->num_input_levels(); l++) {
+    if (c->num_input_files(l) != 0) {
+      if (l == 0 && c->start_level() == 0) {
+        for (size_t i = 0; i < c->num_input_files(0); i++) {
+          input_file.f = c->input(0, i);
+          input_file.level = 0;
+          input_file.index = i;
+          smallest_key_priority_q.push(std::move(input_file));
+        }
+      } else {
+        input_file.f = c->input(l, 0);
+        input_file.level = l;
+        input_file.index = 0;
+        smallest_key_priority_q.push(std::move(input_file));
+      }
+    }
+  }
+  return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+                             SequenceNumber* smallest_seqno,
+                             SequenceNumber* largest_seqno) {
+  bool is_first = true;
+  for (FileMetaData* f : files) {
+    assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+    if (is_first) {
+      is_first = false;
+      *smallest_seqno = f->fd.smallest_seqno;
+      *largest_seqno = f->fd.largest_seqno;
+    } else {
+      if (f->fd.smallest_seqno < *smallest_seqno) {
+        *smallest_seqno = f->fd.smallest_seqno;
+      }
+      if (f->fd.largest_seqno > *largest_seqno) {
+        *largest_seqno = f->fd.largest_seqno;
+      }
+    }
+  }
+}
+#endif
+}  // namespace
+
+// Algorithm that checks whether there are any overlapping
+// files in the input.
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+  auto comparator = icmp_->user_comparator();
+  int first_iter = 1;
+
+  InputFileInfo prev, curr, next;
+
+  SmallestKeyHeap smallest_key_priority_q =
+      create_level_heap(c, icmp_->user_comparator());
+
+  while (!smallest_key_priority_q.empty()) {
+    curr = smallest_key_priority_q.top();
+    smallest_key_priority_q.pop();
+
+    if (first_iter) {
+      prev = curr;
+      first_iter = 0;
+    } else {
+      if (comparator->CompareWithoutTimestamp(
+              prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+        // found overlapping files, return false
+        return false;
+      }
+      assert(comparator->CompareWithoutTimestamp(
+                 curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+      prev = curr;
+    }
+
+    next.f = nullptr;
+
+    if (c->level(curr.level) != 0 &&
+        curr.index < c->num_input_files(curr.level) - 1) {
+      next.f = c->input(curr.level, curr.index + 1);
+      next.level = curr.level;
+      next.index = curr.index + 1;
+    }
+
+    if (next.f) {
+      smallest_key_priority_q.push(std::move(next));
+    }
+  }
+  return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  if (vstorage->CompactionScore(kLevel0) >= 1) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+                                     mutable_cf_options, mutable_db_options,
+                                     vstorage, this, log_buffer);
+  return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+                                                 size_t out_buf_size,
+                                                 bool print_path) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    if (file->fd.GetPathId() == 0 || !print_path) {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+    } else {
+      snprintf(out_buf, out_buf_size,
+               "file %" PRIu64
+               "(path "
+               "%" PRIu32 ")",
+               file->fd.GetNumber(), file->fd.GetPathId());
+    }
+  } else {
+    snprintf(out_buf, out_buf_size, "level %d", level);
+  }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+    char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    snprintf(out_buf, out_buf_size,
+             "file %" PRIu64 "[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+             file->compensated_file_size);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "level %d[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             level, sorted_run_count, size, compensated_file_size);
+  }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+    const VersionStorageInfo& vstorage) {
+  std::vector<SortedRun> ret;
+  for (FileMetaData* f : vstorage.LevelFiles(0)) {
+    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+                     f->being_compacted);
+  }
+  for (int level = 1; level < vstorage.num_levels(); level++) {
+    uint64_t total_compensated_size = 0U;
+    uint64_t total_size = 0U;
+    bool being_compacted = false;
+    for (FileMetaData* f : vstorage.LevelFiles(level)) {
+      total_compensated_size += f->compensated_file_size;
+      total_size += f->fd.GetFileSize();
+      // Size amp, read amp and periodic compactions always include all files
+      // for a non-zero level. However, a delete-triggered compaction or a
+      // trivial move might pick a subset of files in a sorted run. So
+      // always check all files in a sorted run and mark the entire run as
+      // being compacted if one or more files are being compacted.
+      if (f->being_compacted) {
+        being_compacted = f->being_compacted;
+      }
+    }
+    if (total_compensated_size > 0) {
+      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+                       being_compacted);
+    }
+  }
+  return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+  const int kLevel0 = 0;
+  score_ = vstorage_->CompactionScore(kLevel0);
+  sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+  if (sorted_runs_.size() == 0 ||
+      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+       vstorage_->FilesMarkedForCompaction().empty() &&
+       sorted_runs_.size() < (unsigned int)mutable_cf_options_
+                                 .level0_file_num_compaction_trigger)) {
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+                     cf_name_.c_str());
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  ROCKS_LOG_BUFFER_MAX_SZ(
+      log_buffer_, 3072,
+      "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+      cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+  Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+  // because it's a hard requirement.
+  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+    // Always need to do a full compaction for periodic compaction.
+    c = PickPeriodicCompaction();
+  }
+
+  // Check for size amplification.
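+  // The remaining strategies are tried in priority order: size
+  // amplification, then size ratio, then sorted-run count, and finally
+  // delete-triggered compaction; the first one that forms a compaction
+  // wins.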
+  if (c == nullptr &&
+      sorted_runs_.size() >=
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger)) {
+    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: compacting for size amp\n",
+                       cf_name_.c_str());
+    } else {
+      // Size amplification is within limits. Try reducing read
+      // amplification while maintaining file size ratios.
+      unsigned int ratio =
+          mutable_cf_options_.compaction_options_universal.size_ratio;
+
+      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: compacting for size ratio\n",
+                         cf_name_.c_str());
+      } else {
+        // Size amplification and file size ratios are within configured
+        // limits. If max read amplification exceeds the configured limit,
+        // then force a compaction without looking at file size ratios and
+        // try to reduce the number of files to fewer than
+        // level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction()
+        assert(sorted_runs_.size() >=
+               static_cast<size_t>(
+                   mutable_cf_options_.level0_file_num_compaction_trigger));
+        // Get the total number of sorted runs that are not being compacted
+        int num_sr_not_compacted = 0;
+        for (size_t i = 0; i < sorted_runs_.size(); i++) {
+          if (sorted_runs_[i].being_compacted == false) {
+            num_sr_not_compacted++;
+          }
+        }
+
+        // The number of sorted runs that are not being compacted is greater
+        // than the maximum allowed number of sorted runs
+        if (num_sr_not_compacted >
+            mutable_cf_options_.level0_file_num_compaction_trigger) {
+          unsigned int num_files =
+              num_sr_not_compacted -
+              mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+              nullptr) {
+            ROCKS_LOG_BUFFER(log_buffer_,
+                             "[%s] Universal: compacting for file num -- %u\n",
+                             cf_name_.c_str(), num_files);
+          }
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: delete triggered compaction\n",
+                       cf_name_.c_str());
+    }
+  }
+
+  if (c == nullptr) {
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+
+  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+          true &&
+      c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+    c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+  }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+  bool is_first = true;
+
+  size_t level_index = 0U;
+  if (c->start_level() == 0) {
+    for (auto f : *c->inputs(0)) {
+      assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+    level_index = 1U;
+  }
+  for (; level_index < c->num_input_levels(); level_index++) {
+    if (c->num_input_files(level_index) != 0) {
+      SequenceNumber smallest_seqno = 0U;
+      SequenceNumber largest_seqno = 0U;
+      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+                              &largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+  }
+#endif
+  // update statistics
+  size_t num_files = 0;
+  for (auto& each_level : *c->inputs()) {
+    num_files += each_level.files.size();
+  }
+  RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION,
+                    num_files);
+
+  picker_->RegisterCompaction(c);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+  TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+                           c);
+  return c;
+}
+
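+// A worked example of the path selection below: with cf_paths target sizes
+// (300, 1000) and size_ratio = 0, a 100-byte output file has
+// future_size = 100; path 0 qualifies because 300 > 100 and the 200 bytes
+// left on it exceed future_size. A 250-byte output file fails that test
+// (only 50 bytes would be left) and is placed on path 1 instead.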
+uint32_t UniversalCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+  // Two conditions need to be satisfied:
+  // (1) the target path needs to be able to hold the file's size
+  // (2) the total size left in this and previous paths needs to be no
+  //     smaller than the expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+  // For example, if we are now compacting files of size (1, 1, 2, 4, 8),
+  // we will make sure the target file, probably with size of 16, will be
+  // placed in a path so that eventually when new files are generated and
+  // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+  // before the path we chose.
+  //
+  // TODO(sdong): the case of multiple column families is not considered in
+  // this algorithm, so the target size can be violated in that case. We
+  // need to improve it.
+  uint64_t accumulated_size = 0;
+  uint64_t future_size =
+      file_size *
+      (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+  uint32_t p = 0;
+  assert(!ioptions.cf_paths.empty());
+  for (; p < ioptions.cf_paths.size() - 1; p++) {
+    uint64_t target_size = ioptions.cf_paths[p].target_size;
+    if (target_size > file_size &&
+        accumulated_size + (target_size - file_size) > future_size) {
+      return p;
+    }
+    accumulated_size += target_size;
+  }
+  return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+    unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+  unsigned int min_merge_width =
+      mutable_cf_options_.compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+      mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+  const SortedRun* sr = nullptr;
+  bool done = false;
+  size_t start_index = 0;
+  unsigned int candidate_count = 0;
+
+  unsigned int max_files_to_compact =
+      std::min(max_merge_width, max_number_of_files_to_compact);
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Caller checks the size before executing this function. This invariant is
+  // important because otherwise we may have a possible integer underflow when
+  // dealing with unsigned types.
+  assert(sorted_runs_.size() > 0);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+    candidate_count = 0;
+
+    // Skip files that are already being compacted
+    for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+      sr = &sorted_runs_[loop];
+
+      if (!sr->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf));
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: %s"
+                       "[%d] being compacted, skipping",
+                       cf_name_.c_str(), file_num_buf, loop);
+
+      sr = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+    if (sr != nullptr) {
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: Possible candidate %s[%d].",
+                       cf_name_.c_str(), file_num_buf, loop);
+    }
+
+    // Check if the succeeding files need compaction.
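+    // For example, with ratio = 1 and sorted runs of sizes (1, 1, 2, 4, 8),
+    // candidate_size grows 1 -> 2 -> 4 -> 8 -> 16: each next run is no
+    // larger than the accumulated total increased by 1%, so all five runs
+    // are picked into a single compaction.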
+    for (size_t i = loop + 1;
+         candidate_count < max_files_to_compact && i < sorted_runs_.size();
+         i++) {
+      const SortedRun* succeeding_sr = &sorted_runs_[i];
+      if (succeeding_sr->being_compacted) {
+        break;
+      }
+      // Pick files if the total/last candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      double sz = candidate_size * (100.0 + ratio) / 100.0;
+      if (sz < static_cast<double>(succeeding_sr->size)) {
+        break;
+      }
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check that the last picked file
+        // isn't far larger than the next candidate file.
+        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+        if (sz < static_cast<double>(candidate_size)) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size
+          // ratios.
+          break;
+        }
+        candidate_size = succeeding_sr->compensated_file_size;
+      } else {  // default kCompactionStopStyleTotalSize
+        candidate_size += succeeding_sr->compensated_file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (size_t i = loop;
+           i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+        const SortedRun* skipping_sr = &sorted_runs_[i];
+        char file_num_buf[256];
+        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+        ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+                         cf_name_.c_str(), file_num_buf);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  size_t first_index_after = start_index + candidate_count;
+  // Compression is enabled if the files compacted earlier already reached
+  // the size ratio of compression.
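+  // For example, with compression_size_percent = 50 and sorted runs of
+  // sizes (1, 1, 2, 4, 8), compacting the two newest runs leaves the oldest
+  // run of size 8, which alone already holds 50% of the 16-byte total; the
+  // output therefore falls in the newer portion and is left uncompressed.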
+  bool enable_compression = true;
+  int ratio_to_compress =
+      mutable_cf_options_.compaction_options_universal
+          .compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = 0;
+    for (auto& sorted_run : sorted_runs_) {
+      total_size += sorted_run.compensated_file_size;
+    }
+
+    uint64_t older_file_size = 0;
+    for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+      older_file_size += sorted_runs_[i].size;
+      if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  for (unsigned int i = 0; i < first_index_after; i++) {
+    estimated_total_size += sorted_runs_[i].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+  int output_level;
+  if (first_index_after == sorted_runs_.size()) {
+    output_level = vstorage_->num_levels() - 1;
+  } else if (sorted_runs_[first_index_after].level == 0) {
+    output_level = 0;
+  } else {
+    output_level = sorted_runs_[first_index_after].level - 1;
+  }
+
+  // last level is reserved for files ingested behind
+  if (ioptions_.allow_ingest_behind &&
+      (output_level == vstorage_->num_levels() - 1)) {
+    assert(output_level > 1);
+    output_level--;
+  }
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t i = start_index; i < first_index_after; i++) {
+    auto& picking_sr = sorted_runs_[i];
+    if (picking_sr.level == 0) {
+      FileMetaData* picking_file = picking_sr.file;
+      inputs[0].files.push_back(picking_file);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+                     cf_name_.c_str(), file_num_buf);
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  // Include grandparents for potential file cutting in incremental mode.
+  // This aligns file cutting boundaries across levels, so that subsequent
+  // compactions can pick files with aligned boundaries.
+  // Single files are only picked up in incremental mode, so there is no
+  // need for the full range.
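+  // For example, if the picked runs end just above a level-3 sorted run,
+  // the level-3 files become the grandparents below, so output files are
+  // cut at boundaries that line up with level 3 and a later incremental
+  // compaction can pick files that match up cleanly.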
+  if (mutable_cf_options_.compaction_options_universal.incremental &&
+      first_index_after < sorted_runs_.size() &&
+      sorted_runs_[first_index_after].level > 1) {
+    grandparents =
+        vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
+  }
+
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+  CompactionReason compaction_reason;
+  if (max_number_of_files_to_compact == UINT_MAX) {
+    compaction_reason = CompactionReason::kUniversalSizeRatio;
+  } else {
+    compaction_reason = CompactionReason::kUniversalSortedRunNum;
+  }
+  return new Compaction(vstorage_, ioptions_, mutable_cf_options_,
+                        mutable_db_options_, std::move(inputs), output_level,
+                        MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                                            kCompactionStyleUniversal),
+                        GetMaxOverlappingBytes(), path_id,
+                        GetCompressionType(vstorage_, mutable_cf_options_,
+                                           output_level, 1,
+                                           enable_compression),
+                        GetCompressionOptions(mutable_cf_options_, vstorage_,
+                                              output_level,
+                                              enable_compression),
+                        Temperature::kUnknown,
+                        /* max_subcompactions */ 0, grandparents,
+                        /* is manual */ false, /* trim_ts */ "", score_,
+                        false /* deletion_compaction */,
+                        /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+// Look at the overall size amplification. If the size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+  // percentage flexibility while reducing size amplification
+  uint64_t ratio = mutable_cf_options_.compaction_options_universal
+                       .max_size_amplification_percent;
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  size_t start_index = 0;
+  const SortedRun* sr = nullptr;
+
+  assert(!sorted_runs_.empty());
+  if (sorted_runs_.back().being_compacted) {
+    return nullptr;
+  }
+
+  // Skip files that are already being compacted
+  for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+    sr = &sorted_runs_[loop];
+    if (!sr->being_compacted) {
+      start_index = loop;  // Consider this as the first candidate.
+      break;
+    }
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: skipping %s[%d] compacted %s",
+                     cf_name_.c_str(), file_num_buf, loop,
+                     " cannot be a candidate to reduce size amp.\n");
+    sr = nullptr;
+  }
+
+  if (sr == nullptr) {
+    return nullptr;  // no candidate files
+  }
+  {
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+        cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+  }
+
+  // size of the base sorted run for the size amp calculation
+  uint64_t base_sr_size = sorted_runs_.back().size;
+  size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last level
+  if (ioptions_.preclude_last_level_data_seconds > 0 &&
+      ioptions_.num_levels > 2 &&
+      sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+      sorted_runs_.size() > 1) {
+    sr_end_idx = sorted_runs_.size() - 2;
+    base_sr_size = sorted_runs_[sr_end_idx].size;
+  }
+
+  // keep adding up all the remaining files
+  for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+    sr = &sorted_runs_[loop];
+    if (sr->being_compacted) {
+      // TODO: once parallel incremental compactions are supported, we might
+      // want to schedule some of them here if needed.
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(
+          log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+          cf_name_.c_str(), file_num_buf, start_index,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += sr->compensated_file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * base_sr_size) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+    return nullptr;
+  } else {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+  }
+  // Since incremental compaction can't include more than the second-last
+  // level, it can introduce a penalty compared to full compaction. We
+  // hard-code the penalty as 80%: if we end up with a compaction fanout
+  // higher than 80% of full level compactions, we fall back
+  // to full level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevents the case when compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
+  if (mutable_cf_options_.compaction_options_universal.incremental) {
+    double fanout_threshold = static_cast<double>(base_sr_size) /
+                              static_cast<double>(candidate_size) * 1.8;
+    Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+    if (picked != nullptr) {
+      // As the feature is still incremental, picking incremental compaction
+      // might fail and we will fall back to compacting the full level.
+      return picked;
+    }
+  }
+  return PickCompactionWithSortedRunRange(
+      start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+    double fanout_threshold) {
+  // Try to find all potential compactions with total size just over
+  // options.max_compaction_size / 2, and take the one with the lowest
+  // fanout (as defined in the declaration of the function).
+  // This is done by keeping a sliding window of the files at the second
+  // lowest level, and expanding it while finding overlapping files in the
+  // last level. Once the total size exceeds the size threshold, calculate
+  // the fanout value, then shrink from the small side of the window.
+  // Keep doing this until the end.
+  // Finally, we try to include upper level files if they fall into
+  // the range.
+  //
+  // Note that this is a similar problem to leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction the actual fanout value tends to be high, e.g. 10,
+  // so even with a single file in the down-merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users
+  // often size the second-last level at 1/4, 1/3 or even 1/2 of the
+  // bottommost level, so picking a single file in the second-last
+  // level would cause significant waste, which is not desirable.
+  //
+  // This algorithm has lots of room to improve to pick more efficient
+  // compactions.
+  assert(sorted_runs_.size() >= 2);
+  int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+  if (second_last_level == 0) {
+    // Can't split Level 0.
+    return nullptr;
+  }
+  int output_level = sorted_runs_.back().level;
+  const std::vector<FileMetaData*>& bottom_files =
+      vstorage_->LevelFiles(output_level);
+  const std::vector<FileMetaData*>& files =
+      vstorage_->LevelFiles(second_last_level);
+  assert(!bottom_files.empty());
+  assert(!files.empty());
+
+  int picked_start_idx = 0;
+  int picked_end_idx = 0;
+  double picked_fanout = fanout_threshold;
+
+  // Use half the target compaction bytes as an anchor to stop growing the
+  // second-last level files, and reserve growing space for more overlapping
+  // bottom level files, clean-cut files from other levels, etc.
+  uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+  int start_idx = 0;
+  int bottom_end_idx = 0;
+  int bottom_start_idx = 0;
+  uint64_t non_bottom_size = 0;
+  uint64_t bottom_size = 0;
+  bool end_bottom_size_counted = false;
+  for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+    FileMetaData* end_file = files[end_idx];
+
+    // Include bottommost level files smaller than the current second-last
+    // level file.
+    int num_skipped = 0;
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                          end_file->smallest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+      num_skipped++;
+    }
+
+    if (num_skipped > 1) {
+      // At least one file in the bottommost level falls into the file gap.
+      // No reason to include the file. We cut the range and start a new
+      // sliding window.
+      start_idx = end_idx;
+    }
+
+    if (start_idx == end_idx) {
+      // new sliding window.
+      non_bottom_size = 0;
+      bottom_size = 0;
+      bottom_start_idx = bottom_end_idx;
+      end_bottom_size_counted = false;
+    }
+
+    non_bottom_size += end_file->fd.file_size;
+
+    // Include all overlapping files in the bottom level.
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+                          end_file->largest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+        end_bottom_size_counted = true;
+      }
+      if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                         end_file->largest) > 0) {
+        // The next-level file crosses the upper boundary of the current
+        // file.
+        break;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+    }
+
+    if ((non_bottom_size + bottom_size > comp_thres_size ||
+         end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we allow zero-size files at all?
+      // If it is a better compaction, remember it in the picked* variables.
+      double fanout = static_cast<double>(bottom_size) /
+                      static_cast<double>(non_bottom_size);
+      if (fanout < picked_fanout) {
+        picked_start_idx = start_idx;
+        picked_end_idx = end_idx;
+        picked_fanout = fanout;
+      }
+      // Shrink from the start end to get under comp_thres_size
+      while (non_bottom_size + bottom_size > comp_thres_size &&
+             start_idx <= end_idx) {
+        non_bottom_size -= files[start_idx]->fd.file_size;
+        start_idx++;
+        if (start_idx < static_cast<int>(files.size())) {
+          while (bottom_start_idx <= bottom_end_idx &&
+                 icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+                                files[start_idx]->smallest) < 0) {
+            bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+            bottom_start_idx++;
+          }
+        }
+      }
+    }
+  }
+
+  if (picked_fanout >= fanout_threshold) {
+    assert(picked_fanout == fanout_threshold);
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  CompactionInputFiles bottom_level_inputs;
+  CompactionInputFiles second_last_level_inputs;
+  second_last_level_inputs.level = second_last_level;
+  bottom_level_inputs.level = output_level;
+  for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+    if (files[i]->being_compacted) {
+      return nullptr;
+    }
+    second_last_level_inputs.files.push_back(files[i]);
+  }
+  assert(!second_last_level_inputs.empty());
+  if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                       &second_last_level_inputs,
+                                       /*next_smallest=*/nullptr)) {
+    return nullptr;
+  }
+  // We might be able to avoid this binary search if we save and expand
+  // from bottom_start_idx and bottom_end_idx, but for now, we use
+  // SetupOtherInputs() for simplicity.
+  int parent_index = -1;  // Create and use bottom_start_idx?
+  if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+                                 &second_last_level_inputs,
+                                 &bottom_level_inputs, &parent_index,
+                                 /*base_index=*/-1)) {
+    return nullptr;
+  }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from the lower level up, which is the reverse of
+  // the level order, we first write to a reversed data structure and
+  // finally copy it to the compaction inputs.
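+  // The reverse iteration below starts at ++(++rbegin()), i.e. it skips the
+  // bottommost and second-last sorted runs, which are already covered by
+  // bottom_level_inputs and second_last_level_inputs.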
+  InternalKey smallest, largest;
+  picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+  std::vector<CompactionInputFiles> inputs_reverse;
+  for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend();
+       it++) {
+    SortedRun& sr = *it;
+    if (sr.level == 0) {
+      break;
+    }
+    std::vector<FileMetaData*> level_inputs;
+    vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+                                            &level_inputs);
+    if (!level_inputs.empty()) {
+      inputs_reverse.push_back({});
+      inputs_reverse.back().level = sr.level;
+      inputs_reverse.back().files = level_inputs;
+      picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+    }
+  }
+  for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+    inputs.push_back(*it);
+  }
+
+  inputs.push_back(second_last_level_inputs);
+  inputs.push_back(bottom_level_inputs);
+
+  int start_level = Compaction::kInvalidLevel;
+  for (const auto& in : inputs) {
+    if (!in.empty()) {
+      // inputs should already be sorted by level
+      start_level = in.level;
+      break;
+    }
+  }
+
+  // intra-L0 compaction outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // TODO: support multiple paths?
+  uint32_t path_id = 0;
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+  CompactionInputFiles start_level_inputs;
+  int output_level;
+  std::vector<CompactionInputFiles> inputs;
+  std::vector<FileMetaData*> grandparents;
+
+  if (vstorage_->num_levels() == 1) {
+    // This is single-level universal. Since we're basically trying to
+    // reclaim space by processing files marked for compaction due to high
+    // tombstone density, let's do the same thing as compaction to reduce
+    // size amp, which has the same goals.
+    int start_index = -1;
+
+    start_level_inputs.level = 0;
+    start_level_inputs.files.clear();
+    output_level = 0;
+    // Find the first file marked for compaction. Ignore the last file.
+    for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+      SortedRun* sr = &sorted_runs_[loop];
+      if (sr->being_compacted) {
+        continue;
+      }
+      FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+      if (f->marked_for_compaction) {
+        start_level_inputs.files.push_back(f);
+        start_index =
+            static_cast<int>(loop);  // Consider this as the first candidate.
+        break;
+      }
+    }
+    if (start_index < 0) {
+      // Either no file is marked, or they're already being compacted
+      return nullptr;
+    }
+
+    for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
+      SortedRun* sr = &sorted_runs_[loop];
+      if (sr->being_compacted) {
+        break;
+      }
+
+      FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+      start_level_inputs.files.push_back(f);
+    }
+    if (start_level_inputs.size() <= 1) {
+      // If only the last file in L0 is marked for compaction, ignore it
+      return nullptr;
+    }
+    inputs.push_back(start_level_inputs);
+  } else {
+    int start_level;
+
+    // For multi-level universal, the strategy is to make this look more like
+    // leveled. We pick one of the files marked for compaction and compact
+    // with overlapping files in the adjacent level.
+    picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+                                          &output_level, &start_level_inputs);
+    if (start_level_inputs.empty()) {
+      return nullptr;
+    }
+
+    // Pick the first non-empty level after the start_level
+    for (output_level = start_level + 1;
+         output_level < vstorage_->num_levels(); output_level++) {
+      if (vstorage_->NumLevelFiles(output_level) != 0) {
+        break;
+      }
+    }
+
+    // If all higher levels are empty, pick the highest level as output level
+    if (output_level == vstorage_->num_levels()) {
+      if (start_level == 0) {
+        output_level = vstorage_->num_levels() - 1;
+      } else {
+        // If the start level is non-zero and all higher levels are empty,
+        // this compaction will translate into a trivial move. Since the idea
+        // is to reclaim space and a trivial move doesn't help with that, we
+        // skip compaction in this case and return nullptr
+        return nullptr;
+      }
+    }
+    if (ioptions_.allow_ingest_behind &&
+        output_level == vstorage_->num_levels() - 1) {
+      assert(output_level > 1);
+      output_level--;
+    }
+
+    if (output_level != 0) {
+      if (start_level == 0) {
+        if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+                                            output_level, nullptr)) {
+          return nullptr;
+        }
+      }
+
+      CompactionInputFiles output_level_inputs;
+      int parent_index = -1;
+
+      output_level_inputs.level = output_level;
+      if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+                                     &start_level_inputs,
+                                     &output_level_inputs, &parent_index,
+                                     -1)) {
+        return nullptr;
+      }
+      inputs.push_back(start_level_inputs);
+      if (!output_level_inputs.empty()) {
+        inputs.push_back(output_level_inputs);
+      }
+      if (picker_->FilesRangeOverlapWithCompaction(
+              inputs, output_level,
+              Compaction::EvaluatePenultimateLevel(
+                  vstorage_, ioptions_, start_level, output_level))) {
+        return nullptr;
+      }
+
+      picker_->GetGrandparents(vstorage_, start_level_inputs,
+                               output_level_inputs, &grandparents);
+    } else {
+      inputs.push_back(start_level_inputs);
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  // Use the size of the output level as the estimated file size
+  for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+    estimated_total_size += f->fd.GetFileSize();
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+    size_t start_index, CompactionReason compaction_reason) {
+  return PickCompactionWithSortedRunRange(start_index,
+                                          sorted_runs_.size() - 1,
+                                          compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+    size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+  assert(start_index < sorted_runs_.size());
+
+  // Estimate total file size
+  uint64_t estimated_total_size = 0;
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    estimated_total_size += sorted_runs_[loop].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    auto& picking_sr = sorted_runs_[loop];
+    if (picking_sr.level == 0) {
+      FileMetaData* f = picking_sr.file;
+      inputs[0].files.push_back(f);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    std::string comp_reason_print_string;
+    if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+      comp_reason_print_string = "periodic compaction";
+    } else if (compaction_reason ==
+               CompactionReason::kUniversalSizeAmplification) {
+      comp_reason_print_string = "size amp";
+    } else {
+      assert(false);
+      comp_reason_print_string = "unknown: ";
+      comp_reason_print_string.append(
+          std::to_string(static_cast<int>(compaction_reason)));
+    }
+
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+                     cf_name_.c_str(), comp_reason_print_string.c_str(),
+                     file_num_buf);
+  }
+
+  int output_level;
+  if (end_index == sorted_runs_.size() - 1) {
+    // output files at the last level, unless it's reserved
+    output_level = vstorage_->num_levels() - 1;
+    // last level is reserved for files ingested behind
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level > 1);
+      output_level--;
+    }
+  } else {
+    // If the compaction does not include all sorted runs, it can only output
+    // to the level above the `end_index + 1` sorted run.
+    output_level = sorted_runs_[end_index + 1].level - 1;
+  }
+
+  // intra-L0 compaction outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // We never check the size for
+  // compaction_options_universal.compression_size_percent,
+  // because we always compact all the files, so always compress.
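+  // (A compaction over the full sorted-run range replaces the oldest data,
+  // which under the compression_size_percent heuristic always falls in the
+  // compressed portion, hence enable_compression is passed as true below.)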
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+                   cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction: we start from the oldest sorted run and
+  // include all sorted runs, until we hit a sorted run that is already being
+  // compacted. Since the largest (and usually oldest) sorted run is normally
+  // included anyway, doing a full compaction won't increase write
+  // amplification much.
+
+  // Get some information from marked files to check whether a file is
+  // included in the compaction.
+
+  size_t start_index = sorted_runs_.size();
+  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+    start_index--;
+  }
+  if (start_index == sorted_runs_.size()) {
+    return nullptr;
+  }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, so we end up picking files none
+  // of which actually needs periodic compaction. To keep the logic simple,
+  // we just execute the compaction anyway; only when the picked set is just
+  // the last sorted run (either the last level or the last L0 file) do we
+  // verify below that it really covers a file marked for periodic compaction.
+  if (start_index == sorted_runs_.size() - 1) {
+    bool included_file_marked = false;
+    int start_level = sorted_runs_[start_index].level;
+    FileMetaData* start_file = sorted_runs_[start_index].file;
+    for (const std::pair<int, FileMetaData*>& level_file_pair :
+         vstorage_->FilesMarkedForPeriodicCompaction()) {
+      if (start_level != 0) {
+        // Last sorted run is a level
+        if (start_level == level_file_pair.first) {
+          included_file_marked = true;
+          break;
+        }
+      } else {
+        // Last sorted run is a L0 file.
+        if (start_file == level_file_pair.second) {
+          included_file_marked = true;
+          break;
+        }
+      }
+    }
+    if (!included_file_marked) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: Cannot form a compaction covering file "
+                       "marked for periodic compaction",
+                       cf_name_.c_str());
+      return nullptr;
+    }
+  }
+
+  Compaction* c = PickCompactionToOldest(start_index,
+                                         CompactionReason::kPeriodicCompaction);
+
+  TEST_SYNC_POINT_CALLBACK(
+      "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+  return c;
+}
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+  if (!mutable_cf_options_.compaction_options_universal.incremental) {
+    return std::numeric_limits<uint64_t>::max();
+  } else {
+    // Try to align the cutting boundary with files at the next level;
+    // otherwise a file could end up with only 1/2 of the target size, or it
+    // would overlap with two full-size files at the next level.
+ return mutable_cf_options_.target_file_size_base / 2 * 3; + } +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h new file mode 100644 index 000000000..5f897cc9b --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc new file mode 100644 index 000000000..1d2e99d99 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_service_job.cc @@ -0,0 +1,829 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
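As an orientation aid (not part of the upstream patch), the round trip this file implements can be condensed as follows. The sketch uses only calls defined in this file or in the CompactionService interface; `RunRemote` is a hypothetical name, error propagation into `sub_compact->status` and all logging are trimmed, and the real code additionally handles the local-fallback paths in more detail:

    // Condensed sketch of the remote-compaction handshake implemented below.
    CompactionServiceJobStatus RunRemote(CompactionService* service,
                                         const CompactionServiceJobInfo& info,
                                         CompactionServiceInput& input,
                                         CompactionServiceResult* result) {
      std::string input_binary;
      if (!input.Write(&input_binary).ok()) {  // serialize the job description
        return CompactionServiceJobStatus::kFailure;
      }
      CompactionServiceJobStatus s = service->StartV2(info, input_binary);
      if (s != CompactionServiceJobStatus::kSuccess) {
        return s;  // kFailure, or kUseLocal to run the compaction locally
      }
      std::string result_binary;
      s = service->WaitForCompleteV2(info, &result_binary);  // block for result
      if (s == CompactionServiceJobStatus::kUseLocal) {
        return s;  // remote side asked the primary to run it locally instead
      }
      if (!CompactionServiceResult::Read(result_binary, result).ok()) {
        return CompactionServiceJobStatus::kFailure;  // result didn't parse
      }
      return s;
    }

On success, the caller still has to rename the returned output files into the DB's directory and install their metadata, which is exactly what the second half of ProcessKeyValueCompactionWithCompactionService below does.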
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+    SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+  assert(db_options_.compaction_service);
+
+  const Compaction* compaction = sub_compact->compaction;
+  CompactionServiceInput compaction_input;
+  compaction_input.output_level = compaction->output_level();
+  compaction_input.db_id = db_id_;
+
+  const std::vector<CompactionInputFiles>& inputs =
+      *(compact_->compaction->inputs());
+  for (const auto& files_per_level : inputs) {
+    for (const auto& file : files_per_level.files) {
+      compaction_input.input_files.emplace_back(
+          MakeTableFileName(file->fd.GetNumber()));
+    }
+  }
+  compaction_input.column_family.name =
+      compaction->column_family_data()->GetName();
+  compaction_input.column_family.options =
+      compaction->column_family_data()->GetLatestCFOptions();
+  compaction_input.db_options =
+      BuildDBOptions(db_options_, mutable_db_options_copy_);
+  compaction_input.snapshots = existing_snapshots_;
+  compaction_input.has_begin = sub_compact->start.has_value();
+  compaction_input.begin =
+      compaction_input.has_begin ? sub_compact->start->ToString() : "";
+  compaction_input.has_end = sub_compact->end.has_value();
+  compaction_input.end =
+      compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+  std::string compaction_input_binary;
+  Status s = compaction_input.Write(&compaction_input_binary);
+  if (!s.ok()) {
+    sub_compact->status = s;
+    return CompactionServiceJobStatus::kFailure;
+  }
+
+  std::ostringstream input_files_oss;
+  bool is_first_one = true;
+  for (const auto& file : compaction_input.input_files) {
+    input_files_oss << (is_first_one ?
"" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if (compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + meta.unique_id = file.unique_id; + + auto cfd = compaction->column_family_data(); + sub_compact->Current().AddOutput(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->Current().SetNumOutputRecords( + compaction_result.num_output_records); + sub_compact->Current().SetTotalBytes(compaction_result.total_bytes); + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} + +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::atomic& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, + 
+          std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+          std::move(table_cache), event_logger,
+          compaction->mutable_cf_options()->paranoid_file_checks,
+          compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+          manual_compaction_canceled, db_id, db_session_id,
+          compaction->column_family_data()->GetFullHistoryTsLow()),
+      output_path_(std::move(output_path)),
+      compaction_input_(compaction_service_input),
+      compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+
+  auto* c = compact_->compaction;
+  assert(c->column_family_data() != nullptr);
+  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ =
+      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  Slice begin = compaction_input_.begin;
+  Slice end = compaction_input_.end;
+  compact_->sub_compact_states.emplace_back(
+      c,
+      compaction_input_.has_begin ? std::optional<Slice>(begin)
+                                  : std::optional<Slice>(),
+      compaction_input_.has_end ? std::optional<Slice>(end)
+                                : std::optional<Slice>(),
+      /*sub_job_id*/ 0);
+
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+  // Pick the only sub-compaction we should have
+  assert(compact_->sub_compact_states.size() == 1);
+  SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+  ProcessKeyValueCompaction(sub_compact);
+
+  compaction_stats_.stats.micros =
+      db_options_.clock->NowMicros() - start_micros;
+  compaction_stats_.stats.cpu_micros =
+      sub_compact->compaction_job_stats.cpu_micros;
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  Status status = sub_compact->status;
+  IOStatus io_s = sub_compact->io_status;
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+                                                    DirFsyncOptions());
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    // TODO: Add verify_table()
+  }
+
+  // Finish up all book-keeping to unify the subcompaction results
+  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+  UpdateCompactionStats();
+  RecordCompactionIOStats();
+
+  LogFlush(db_options_.info_log);
+  compact_->status = status;
+  compact_->status.PermitUncheckedError();
+
+  // Build compaction result
+  compaction_result_->output_level = compact_->compaction->output_level();
+  compaction_result_->output_path = output_path_;
+  for (const auto& output_file : sub_compact->GetOutputs()) {
+    auto& meta = output_file.meta;
+    compaction_result_->output_files.emplace_back(
+        MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+        meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+        meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+        meta.file_creation_time, output_file.validator.GetHash(),
+        meta.marked_for_compaction, meta.unique_id);
+  }
+  InternalStats::CompactionStatsFull compaction_stats;
+  sub_compact->AggregateCompactionStats(compaction_stats);
+  compaction_result_->num_output_records =
+      compaction_stats.stats.num_output_records;
+  compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+  return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+  CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+  kOptionsString = 1,  // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+    {"name",
+     {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"options",
+     {offsetof(struct ColumnFamilyDescriptor, options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+        return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+                                                value, cf_options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+        std::string result;
+        auto status =
+            GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+        const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+        auto this_conf = CFOptionsAsConfigurable(*this_one);
+        auto that_conf = CFOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "." + mismatch_opt;
+        }
+        return result;
+      }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+    {"column_family",
+     OptionTypeInfo::Struct(
+         "column_family", &cfd_type_info,
+         offsetof(struct CompactionServiceInput, column_family),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+    {"db_options",
+     {offsetof(struct CompactionServiceInput, db_options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto options = static_cast<DBOptions*>(addr);
+        return GetDBOptionsFromString(opts, DBOptions(), value, options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto options = static_cast<const DBOptions*>(addr);
+        std::string result;
+        auto status = GetStringFromDBOptions(opts, *options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const DBOptions*>(addr1);
+        const auto that_one = static_cast<const DBOptions*>(addr2);
+        auto this_conf = DBOptionsAsConfigurable(*this_one);
+        auto that_conf = DBOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "."
+ mismatch_opt;
+        }
+        return result;
+      }}},
+    {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+                      offsetof(struct CompactionServiceInput, snapshots),
+                      OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                      {0, OptionType::kUInt64T})},
+    {"input_files", OptionTypeInfo::Vector<std::string>(
+                        offsetof(struct CompactionServiceInput, input_files),
+                        OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                        {0, OptionType::kEncodedString})},
+    {"output_level",
+     {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"db_id",
+     {offsetof(struct CompactionServiceInput, db_id),
+      OptionType::kEncodedString}},
+    {"has_begin",
+     {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"begin",
+     {offsetof(struct CompactionServiceInput, begin),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"has_end",
+     {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"end",
+     {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cs_output_file_type_info = {
+        {"file_name",
+         {offsetof(struct CompactionServiceOutputFile, file_name),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"oldest_ancester_time",
+         {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_creation_time",
+         {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"paranoid_hash",
+         {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"marked_for_compaction",
+         {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"unique_id",
+         OptionTypeInfo::Array<uint64_t, 2>(
+             offsetof(struct CompactionServiceOutputFile, unique_id),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+             {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_job_stats_type_info = {
+        {"elapsed_micros",
+         {offsetof(struct CompactionJobStats, elapsed_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cpu_micros",
+         {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"num_input_records",
+
{offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + {offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + 
OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_fsync_nanos",
+         {offsetof(struct CompactionJobStats, file_fsync_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_prepare_write_nanos",
+         {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_fallthru",
+         {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_mismatch",
+         {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+namespace {
+// This is a helper struct to serialize and deserialize class Status, because
+// Status's members are not public.
+struct StatusSerializationAdapter {
+  uint8_t code;
+  uint8_t subcode;
+  uint8_t severity;
+  std::string message;
+
+  StatusSerializationAdapter() = default;
+  explicit StatusSerializationAdapter(const Status& s) {
+    code = s.code();
+    subcode = s.subcode();
+    severity = s.severity();
+    auto msg = s.getState();
+    message = msg ? msg : "";
+  }
+
+  Status GetStatus() const {
+    return Status{static_cast<Status::Code>(code),
+                  static_cast<Status::SubCode>(subcode),
+                  static_cast<Status::Severity>(severity), message};
+  }
+};
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    status_adapter_type_info = {
+        {"code",
+         {offsetof(struct StatusSerializationAdapter, code),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"subcode",
+         {offsetof(struct StatusSerializationAdapter, subcode),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"severity",
+         {offsetof(struct StatusSerializationAdapter, severity),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"message",
+         {offsetof(struct StatusSerializationAdapter, message),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+    {"status",
+     {offsetof(struct CompactionServiceResult, status),
+      OptionType::kCustomizable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto status_obj = static_cast<Status*>(addr);
+        StatusSerializationAdapter adapter;
+        Status s = OptionTypeInfo::ParseType(
+            opts, value, status_adapter_type_info, &adapter);
+        *status_obj = adapter.GetStatus();
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto status_obj = static_cast<const Status*>(addr);
+        StatusSerializationAdapter adapter(*status_obj);
+        std::string result;
+        Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+                                                 &adapter, &result);
+        *value = "{" + result + "}";
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr1, const void* addr2, std::string* mismatch) {
+        const auto status1 = static_cast<const Status*>(addr1);
+        const auto status2 = static_cast<const Status*>(addr2);
+
+        StatusSerializationAdapter adapter1(*status1);
+        StatusSerializationAdapter adapter2(*status2);
+        return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                             &adapter1, &adapter2, mismatch);
+      }}},
+    {"output_files",
+     OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+         offsetof(struct CompactionServiceResult, output_files),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+         OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+                                OptionVerificationType::kNormal,
+                                OptionTypeFlags::kNone))},
+    {"output_level",
+     {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"output_path",
+     {offsetof(struct CompactionServiceResult, output_path),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"num_output_records",
+     {offsetof(struct CompactionServiceResult, num_output_records),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"total_bytes",
+     {offsetof(struct CompactionServiceResult, total_bytes),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_read",
+     {offsetof(struct CompactionServiceResult, bytes_read),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_written",
+     {offsetof(struct CompactionServiceResult, bytes_written),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"stats", OptionTypeInfo::Struct(
+                  "stats", &compaction_job_stats_type_info,
+                  offsetof(struct CompactionServiceResult, stats),
+                  OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+                                    CompactionServiceInput* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceInput string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Input data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+                                     CompactionServiceResult* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceResult string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Result data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+                                          std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+                                       mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+                                         std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+                                       mismatch);
+}
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc
new file mode 100644
index 000000000..c475c4e3b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_test.cc
@@ -0,0 +1,966 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MyTestCompactionService : public CompactionService {
+ public:
+  MyTestCompactionService(
+      std::string db_path, Options& options,
+      std::shared_ptr<Statistics>& statistics,
+      std::vector<std::shared_ptr<EventListener>>& listeners,
+      std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+          table_properties_collector_factories)
+      : db_path_(std::move(db_path)),
+        options_(options),
+        statistics_(statistics),
+        start_info_("na", "na", "na", 0, Env::TOTAL),
+        wait_info_("na", "na", "na", 0, Env::TOTAL),
+        listeners_(listeners),
+        table_properties_collector_factories_(
+            std::move(table_properties_collector_factories)) {}
+
+  static const char* kClassName() { return "MyTestCompactionService"; }
+
+  const char* Name() const override { return kClassName(); }
+
+  CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    InstrumentedMutexLock l(&mutex_);
+    start_info_ = info;
+    assert(info.db_name == db_path_);
+    jobs_.emplace(info.job_id, compaction_service_input);
+    CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess;
+    if (is_override_start_status_) {
+      return override_start_status_;
+    }
+    return s;
+  }
+
+  CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& info,
+      std::string* compaction_service_result) override {
+    std::string compaction_input;
+    assert(info.db_name == db_path_);
+    {
+      InstrumentedMutexLock l(&mutex_);
+      wait_info_ = info;
+      auto i = jobs_.find(info.job_id);
+      if (i == jobs_.end()) {
+        return CompactionServiceJobStatus::kFailure;
+      }
+      compaction_input = std::move(i->second);
+      jobs_.erase(i);
+    }
+
+    if (is_override_wait_status_) {
+      return override_wait_status_;
+    }
+
+    CompactionServiceOptionsOverride options_override;
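+    // The override below re-points the environment- and callback-typed
+    // options at this service's copies of the primary's objects, so the
+    // DB::OpenAndCompact() call further down opens the DB with the same
+    // comparator, table format, filters, and statistics sink as the primary.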
+    options_override.env = options_.env;
+    options_override.file_checksum_gen_factory =
+        options_.file_checksum_gen_factory;
+    options_override.comparator = options_.comparator;
+    options_override.merge_operator = options_.merge_operator;
+    options_override.compaction_filter = options_.compaction_filter;
+    options_override.compaction_filter_factory =
+        options_.compaction_filter_factory;
+    options_override.prefix_extractor = options_.prefix_extractor;
+    options_override.table_factory = options_.table_factory;
+    options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
+    options_override.statistics = statistics_;
+    if (!listeners_.empty()) {
+      options_override.listeners = listeners_;
+    }
+
+    if (!table_properties_collector_factories_.empty()) {
+      options_override.table_properties_collector_factories =
+          table_properties_collector_factories_;
+    }
+
+    OpenAndCompactOptions options;
+    options.canceled = &canceled_;
+
+    Status s = DB::OpenAndCompact(
+        options, db_path_, db_path_ + "/" + std::to_string(info.job_id),
+        compaction_input, compaction_service_result, options_override);
+    if (is_override_wait_result_) {
+      *compaction_service_result = override_wait_result_;
+    }
+    compaction_num_.fetch_add(1);
+    if (s.ok()) {
+      return CompactionServiceJobStatus::kSuccess;
+    } else {
+      return CompactionServiceJobStatus::kFailure;
+    }
+  }
+
+  int GetCompactionNum() { return compaction_num_.load(); }
+
+  CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
+  CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; }
+
+  void OverrideStartStatus(CompactionServiceJobStatus s) {
+    is_override_start_status_ = true;
+    override_start_status_ = s;
+  }
+
+  void OverrideWaitStatus(CompactionServiceJobStatus s) {
+    is_override_wait_status_ = true;
+    override_wait_status_ = s;
+  }
+
+  void OverrideWaitResult(std::string str) {
+    is_override_wait_result_ = true;
+    override_wait_result_ = std::move(str);
+  }
+
+  void ResetOverride() {
+    is_override_wait_result_ = false;
+    is_override_start_status_ = false;
+    is_override_wait_status_ = false;
+  }
+
+  void SetCanceled(bool canceled) { canceled_ = canceled; }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::atomic_int compaction_num_{0};
+  std::map<uint64_t, std::string> jobs_;
+  const std::string db_path_;
+  Options options_;
+  std::shared_ptr<Statistics> statistics_;
+  CompactionServiceJobInfo start_info_;
+  CompactionServiceJobInfo wait_info_;
+  bool is_override_start_status_ = false;
+  CompactionServiceJobStatus override_start_status_ =
+      CompactionServiceJobStatus::kFailure;
+  bool is_override_wait_status_ = false;
+  CompactionServiceJobStatus override_wait_status_ =
+      CompactionServiceJobStatus::kFailure;
+  bool is_override_wait_result_ = false;
+  std::string override_wait_result_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      table_properties_collector_factories_;
+  std::atomic_bool canceled_{false};
+};
+
+class CompactionServiceTest : public DBTestBase {
+ public:
+  explicit CompactionServiceTest()
+      : DBTestBase("compaction_service_test", true) {}
+
+ protected:
+  void ReopenWithCompactionService(Options* options) {
+    options->env = env_;
+    primary_statistics_ = CreateDBStatistics();
+    options->statistics = primary_statistics_;
+    compactor_statistics_ = CreateDBStatistics();
+
+    compaction_service_ = std::make_shared<MyTestCompactionService>(
+        dbname_, *options, compactor_statistics_, remote_listeners,
+        remote_table_properties_collector_factories);
+    options->compaction_service = compaction_service_;
+    DestroyAndReopen(*options);
+  }
+
+  Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); }
+
+  Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); }
+
+  MyTestCompactionService* GetCompactionService() {
+    CompactionService* cs = compaction_service_.get();
+    return static_cast_with_check<MyTestCompactionService>(cs);
+  }
+
+  void GenerateTestData() {
+    // Generate 20 files @ L2
+    for (int i = 0; i < 20; i++) {
+      for (int j = 0; j < 10; j++) {
+        int key_id = i * 10 + j;
+        ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+
+    // Generate 10 files @ L1 overlapping all 20 files @ L2
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 10; j++) {
+        int key_id = i * 20 + j * 2;
+        ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+    ASSERT_EQ(FilesPerLevel(), "0,10,20");
+  }
+
+  void VerifyTestData() {
+    for (int i = 0; i < 200; i++) {
+      auto result = Get(Key(i));
+      if (i % 2) {
+        ASSERT_EQ(result, "value" + std::to_string(i));
+      } else {
+        ASSERT_EQ(result, "value_new" + std::to_string(i));
+      }
+    }
+  }
+
+  std::vector<std::shared_ptr<EventListener>> remote_listeners;
+  std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      remote_table_properties_collector_factories;
+
+ private:
+  std::shared_ptr<Statistics> compactor_statistics_;
+  std::shared_ptr<Statistics> primary_statistics_;
+  std::shared_ptr<CompactionService> compaction_service_;
+};
+
+TEST_F(CompactionServiceTest, BasicCompactions) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+
+  // make sure the compaction statistics are only recorded on the remote side
+  ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
+  ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+  ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
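+  // The ticker checks below pin down where the work was accounted: all
+  // compaction writes happen on the compactor, the primary sees them only
+  // under the REMOTE_COMPACT_* tickers, and the remote side records no
+  // REMOTE_* tickers of its own.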
+  // even with remote compaction, the primary host still needs to read SST
+  // files to `verify_table()`.
+  ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+  // all the compaction writes happen on the remote side
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+  ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1);
+  ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES),
+            primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES));
+  // the compactor is already the remote side, so it has no remote tickers
+  ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+  ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            0);
+
+  // Test failed compaction
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+        // override job status
+        auto s = static_cast<Status*>(status);
+        *s = Status::Aborted("MyTestCompactionService failed to compact!");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s;
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      s = Put(Key(key_id), "value_new" + std::to_string(key_id));
+      if (s.IsAborted()) {
+        break;
+      }
+    }
+    if (s.IsAborted()) {
+      break;
+    }
+    s = Flush();
+    if (s.IsAborted()) {
+      break;
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (s.IsAborted()) {
+      break;
+    }
+  }
+  ASSERT_TRUE(s.IsAborted());
+
+  // Test re-open and successful unique id verification
+  std::atomic_int verify_passed{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+        // override job status
+        auto id = static_cast<UniqueId64x2*>(arg);
+        assert(*id != kNullUniqueId64x2);
+        verify_passed++;
+      });
+  Reopen(options);
+  ASSERT_GT(verify_passed, 0);
+  Close();
+}
+
+TEST_F(CompactionServiceTest, ManualCompaction) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  start_str = Key(120);
+  start = start_str;
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  end_str = Key(92);
+  end = end_str;
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  // Test cancel compaction at the beginning
+  my_cs->SetCanceled(true);
+  auto s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+  // compaction number is not increased
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+  VerifyTestData();
+
+  // Test cancel compaction in progress
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+  my_cs = GetCompactionService();
+  my_cs->SetCanceled(false);
+
+  std::atomic_bool cancel_issued{false};
+  SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress",
+                                        [&](void* /*arg*/) {
+                                          cancel_issued = true;
+                                          my_cs->SetCanceled(true);
+                                        });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(cancel_issued);
+  // compaction number is not increased
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+  VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, FailedToStart) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+  my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure);
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+}
+
+TEST_F(CompactionServiceTest, InvalidResult) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+  my_cs->OverrideWaitResult("Invalid Str");
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_FALSE(s.ok());
+}
+
+TEST_F(CompactionServiceTest, SubCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  int compaction_num_before = my_cs->GetCompactionNum();
+
+  auto cro = CompactRangeOptions();
+  cro.max_subcompactions = 10;
+  Status s = db_->CompactRange(cro, nullptr, nullptr);
+  ASSERT_OK(s);
+  VerifyTestData();
+  int compaction_num = my_cs->GetCompactionNum() - compaction_num_before;
+  // make sure there's sub-compaction by checking the compaction number
+  ASSERT_GE(compaction_num, 2);
+}
+
+class PartialDeleteCompactionFilter : public CompactionFilter {
+ public:
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& key, ValueType /*value_type*/,
+      const Slice& /*existing_value*/, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    int i = std::stoi(key.ToString().substr(3));
+    if (i > 5 && i <= 105) {
+      return CompactionFilter::Decision::kRemove;
+    }
+    return CompactionFilter::Decision::kKeep;
+  }
+
+  const char* Name() const override { return "PartialDeleteCompactionFilter"; }
+};
+
+TEST_F(CompactionServiceTest, CompactionFilter) {
+  Options options = CurrentOptions();
+  std::unique_ptr<CompactionFilter> delete_comp_filter(
+      new PartialDeleteCompactionFilter());
+  options.compaction_filter = delete_comp_filter.get();
+  ReopenWithCompactionService(&options);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
"value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_F(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_F(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.max_background_jobs = 20; + ReopenWithCompactionService(&options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector threads; + for (const auto& file : meta.levels[1].files) { + threads.emplace_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +TEST_F(CompactionServiceTest, CompactionInfo) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + auto my_cs = + static_cast_with_check(GetCompactionService()); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_GE(comp_num, 1); + + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(dbname_, info.db_name); + std::string db_id, db_session_id; + ASSERT_OK(db_->GetDbIdentity(db_id)); + ASSERT_EQ(db_id, info.db_id); + ASSERT_OK(db_->GetDbSessionId(db_session_id)); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(dbname_, info.db_name); + ASSERT_EQ(db_id, 
+  ASSERT_EQ(db_session_id, info.db_session_id);
+  ASSERT_EQ(Env::LOW, info.priority);
+
+  // Test priority USER
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  SstFileMetaData file = meta.levels[1].files[0];
+  ASSERT_OK(db_->CompactFiles(CompactionOptions(),
+                              {file.db_path + "/" + file.name}, 2));
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(Env::USER, info.priority);
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(Env::USER, info.priority);
+
+  // Test priority BOTTOM
+  env_->SetBackgroundThreads(1, Env::BOTTOM);
+  options.num_levels = 2;
+  ReopenWithCompactionService(&options);
+  my_cs =
+      static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(Env::BOTTOM, info.priority);
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(Env::BOTTOM, info.priority);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalAuto) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  uint64_t compactor_write_bytes =
+      compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+  uint64_t primary_write_bytes =
+      primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+  my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+
+  ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+
+  // make sure the compaction statistics are only recorded on the local side
+  ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            compactor_write_bytes);
+  ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            primary_write_bytes);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalManual) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  uint64_t compactor_write_bytes =
+      compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+  uint64_t primary_write_bytes =
primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + // re-enable remote compaction + my_cs->ResetOverride(); + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + + // return run local again with API WaitForComplete + my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal); + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(my_cs->GetCompactionNum(), + comp_num); // no remote compaction is run + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_write_bytes); + + // verify result after 2 manual compactions + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, RemoteEventListener) { + class RemoteEventListenerTest : public EventListener { + public: + const char* Name() const override { return "RemoteEventListenerTest"; } + + void OnSubcompactionBegin(const SubcompactionJobInfo& info) override { + auto result = on_going_compactions.emplace(info.job_id); + ASSERT_TRUE(result.second); // make sure there's no duplication + compaction_num++; + EventListener::OnSubcompactionBegin(info); + } + void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override { + auto num = on_going_compactions.erase(info.job_id); + ASSERT_TRUE(num == 1); // make sure the compaction id exists + EventListener::OnSubcompactionCompleted(info); + } + void OnTableFileCreated(const TableFileCreationInfo& info) override { + ASSERT_EQ(on_going_compactions.count(info.job_id), 1); + file_created++; + EventListener::OnTableFileCreated(info); + } + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) override { + ASSERT_EQ(on_going_compactions.count(info.job_id), 1); + file_creation_started++; + EventListener::OnTableFileCreationStarted(info); + } + + bool ShouldBeNotifiedOnFileIO() override { + file_io_notified++; + return EventListener::ShouldBeNotifiedOnFileIO(); + } + + std::atomic_uint64_t file_io_notified{0}; + std::atomic_uint64_t file_creation_started{0}; + std::atomic_uint64_t file_created{0}; + + std::set on_going_compactions; // store the job_id + std::atomic_uint64_t compaction_num{0}; + }; + + auto listener = new RemoteEventListenerTest(); + remote_listeners.emplace_back(listener); + + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + 
ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // check the events are triggered + ASSERT_TRUE(listener->file_io_notified > 0); + ASSERT_TRUE(listener->file_creation_started > 0); + ASSERT_TRUE(listener->file_created > 0); + ASSERT_TRUE(listener->compaction_num > 0); + ASSERT_TRUE(listener->on_going_compactions.empty()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } +} + +TEST_F(CompactionServiceTest, TablePropertiesCollector) { + const static std::string kUserPropertyName = "TestCount"; + + class TablePropertiesCollectorTest : public TablePropertiesCollector { + public: + Status Finish(UserCollectedProperties* properties) override { + *properties = UserCollectedProperties{ + {kUserPropertyName, std::to_string(count_)}, + }; + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties(); + } + + const char* Name() const override { return "TablePropertiesCollectorTest"; } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + count_++; + return Status::OK(); + } + + private: + uint32_t count_ = 0; + }; + + class TablePropertiesCollectorFactoryTest + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TablePropertiesCollectorTest(); + } + + const char* Name() const override { + return "TablePropertiesCollectorFactoryTest"; + } + }; + + auto factory = new TablePropertiesCollectorFactoryTest(); + remote_table_properties_collector_factories.emplace_back(factory); + + const int kNumSst = 3; + const int kLevel0Trigger = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kLevel0Trigger; + ReopenWithCompactionService(&options); + + // generate a few SSTs locally which should not have user property + for (int i = 0; i < kNumSst; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props)); + for (const auto& file_props : fname_to_props) { + auto properties = file_props.second->user_collected_properties; + auto it = properties.find(kUserPropertyName); + ASSERT_EQ(it, properties.end()); + } + + // trigger compaction + for (int i = kNumSst; i < kLevel0Trigger; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props)); + + bool has_user_property = false; + for (const auto& file_props : fname_to_props) { + auto properties = file_props.second->user_collected_properties; + auto it = properties.find(kUserPropertyName); + if (it != properties.end()) { + has_user_property = true; + ASSERT_GT(std::stoi(it->second), 0); + } + } + ASSERT_TRUE(has_user_property); +} + +} // namespace 
ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+  for (const auto& sub_compact_state : sub_compact_states) {
+    Slice smallest = sub_compact_state.SmallestUserKey();
+    if (!smallest.empty()) {
+      return smallest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+  for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+       ++it) {
+    Slice largest = it->LargestUserKey();
+    if (!largest.empty()) {
+      return largest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats,
+    CompactionJobStats& compaction_job_stats) {
+  for (const auto& sc : sub_compact_states) {
+    sc.AggregateCompactionStats(compaction_stats);
+    compaction_job_stats.Add(sc.compaction_job_stats);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used by compaction_job and compaction_service_job, holding
+// the list of sub_compact_states and the aggregated information for the
+// compaction.
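+//
+// Illustrative use (assumed, mirroring the declarations below): a compaction
+// job constructs one CompactionState per Compaction, fills sub_compact_states
+// in key order, and finally calls AggregateCompactionStats() to roll the
+// per-subcompaction stats into job-level totals.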
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+  Compaction* const compaction;
+
+  // REQUIRED: subcompaction states are stored in order of increasing key-range
+  std::vector<SubcompactionState> sub_compact_states;
+  Status status;
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats,
+      CompactionJobStats& compaction_job_stats);
+
+  explicit CompactionState(Compaction* c) : compaction(c) {}
+
+  Slice SmallestUserKey();
+
+  Slice LargestUserKey();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h
new file mode 100644
index 000000000..82dddcf93
--- /dev/null
+++ b/src/rocksdb/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to the TTL limit. This boosting could be
+// done through FileMetaData.compensated_file_size, but that compensated size
+// is widely used as something similar to file size, so dramatically boosting
+// the value might cause unintended consequences.
+//
+// This boosting algorithm can go very fancy, but here we use a simple
+// formula which can satisfy:
+// (1) Different levels are triggered slightly differently, to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosted more when TTL gets closer.
+//
+// Don't do any boosting before half of the TTL has passed, which keeps
+// write amplification lower for the common case. All levels should be
+// fully boosted when the total TTL compaction threshold triggers.
+// Differentiate the boosting ranges of each level by 1/2. This makes the
+// range for each level exponentially increasing. We could make them equal,
+// or go even fancier. We can adjust it after we observe the behavior in
+// production.
+// The thresholds where boosting starts:
+// +------------------------------------------------------------------+
+//  ^              ^                ^                 ^    ^          ^
+//  Age 0         ...               |                 |    second last level threshold
+//                                  |                 |
+//                                  |                 third last level
+//                                  |
+//                                  fourth last level
+//
+// The boost is arbitrarily set to 0 when a file reaches age boost_age_start
+// and grows linearly. The ratio is arbitrarily set so that when the next
+// level starts to boost, the previous level's boosting amount is 16.
+class FileTtlBooster {
+ public:
+  FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+                 int level)
+      : current_time_(current_time) {
+    if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+      enabled_ = false;
+      boost_age_start_ = 0;
+      boost_step_ = 1;
+    } else {
+      enabled_ = true;
+      uint64_t all_boost_start_age = ttl / 2;
+      uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+      uint64_t boost_age_range =
+          all_boost_age_range >> (num_non_empty_levels - level - 1);
+      boost_age_start_ = all_boost_start_age + boost_age_range;
+      const uint64_t kBoostRatio = 16;
+      // Prevent a zero value to avoid a divide-by-zero below.
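+      // Rough worked example (illustrative numbers, not from the upstream
+      // comment): ttl = 320, num_non_empty_levels = 4, level = 2 gives
+      // all_boost_start_age = 160, all_boost_age_range = 150,
+      // boost_age_range = 150 >> 1 = 75, boost_age_start_ = 235 and
+      // boost_step_ = max(75 / 16, 1) = 4, so GetBoostScore() returns
+      // (age - 235) / 4 + 1 once a file is older than 235.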
+      boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+    }
+  }
+
+  uint64_t GetBoostScore(FileMetaData* f) {
+    if (!enabled_) {
+      return 1;
+    }
+    uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+    if (oldest_ancester_time >= current_time_) {
+      return 1;
+    }
+    uint64_t age = current_time_ - oldest_ancester_time;
+    if (age > boost_age_start_) {
+      // Use integer just for convenience.
+      // We could make all file_to_order values double if we wanted.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore the case for simplicity.
+      // Boosting is an addition to the current value, so +1. This effectively
+      // makes boosting kick in once the first boost_step_ is reached.
+      return (age - boost_age_start_) / boost_step_ + 1;
+    }
+    return 1;
+  }
+
+ private:
+  bool enabled_;
+  uint64_t current_time_;
+  uint64_t boost_age_start_;
+  uint64_t boost_step_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc
new file mode 100644
index 000000000..9e7f9fa89
--- /dev/null
+++ b/src/rocksdb/db/compaction/sst_partitioner.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <mutex>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+    sst_fixed_prefix_type_info = {
+#ifndef ROCKSDB_LITE
+        {"length",
+         {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+    : len_(len) {
+  RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+    const PartitionerRequest& request) {
+  Slice last_key_fixed(*request.prev_user_key);
+  if (last_key_fixed.size() > len_) {
+    last_key_fixed.size_ = len_;
+  }
+  Slice current_key_fixed(*request.current_user_key);
+  if (current_key_fixed.size() > len_) {
+    current_key_fixed.size_ = len_;
+  }
+  return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+                                                        : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+    const Slice& smallest_user_key, const Slice& largest_user_key) {
+  return ShouldPartition(PartitionerRequest(smallest_user_key,
+                                            largest_user_key, 0)) ==
+         kNotRequired;
+}
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+    const SstPartitioner::Context& /* context */) const {
+  return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+    size_t prefix_len) {
+  return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+                                           const std::string& /*arg*/) {
+  library.AddFactory<SstPartitionerFactory>(
+      SstPartitionerFixedPrefixFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<SstPartitionerFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new SstPartitionerFixedPrefixFactory(0));
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+#endif  // ROCKSDB_LITE
+
+Status SstPartitionerFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<SstPartitionerFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
+                                                 result);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..0c56471e9
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats) const {
+  compaction_stats.stats.Add(compaction_outputs_.stats_);
+  if (HasPenultimateLevelOutputs()) {
+    compaction_stats.has_penultimate_level_output = true;
+    compaction_stats.penultimate_level_stats.Add(
+        penultimate_level_outputs_.stats_);
+  }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+  return OutputIterator(penultimate_level_outputs_.outputs_,
+                        compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+  penultimate_level_outputs_.Cleanup();
+  compaction_outputs_.Cleanup();
+
+  if (!status.ok()) {
+    for (const auto& out : GetOutputs()) {
+      // If this file was inserted into the table cache, then remove
+      // it here because this compaction was not committed.
+      TableCache::Evict(cache, out.meta.fd.GetNumber());
+    }
+  }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional. So ignoring the io_status as of now.
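+  // (PermitUncheckedError() below marks the IOStatus as inspected, so builds
+  // that enforce status checking do not assert when it is destroyed.)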
+  io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.SmallestUserKey();
+    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) > 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.SmallestUserKey();
+  }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.LargestUserKey();
+    Slice b = penultimate_level_outputs_.LargestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) < 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.LargestUserKey();
+  }
+}
+
+Status SubcompactionState::AddToOutput(
+    const CompactionIterator& iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  // update target output first
+  is_current_penultimate_level_ = iter.output_to_penultimate_level();
+  current_outputs_ = is_current_penultimate_level_
+                         ? &penultimate_level_outputs_
+                         : &compaction_outputs_;
+  if (is_current_penultimate_level_) {
+    has_penultimate_level_outputs_ = true;
+  }
+
+  return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..13e63120f
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.h
@@ -0,0 +1,214 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+//  1. one for the normal output files
+//  2. another for the penultimate level outputs
+// A `current` pointer maintains the current output group: when
+// `AddToOutput()` is called, it checks the output of the current
+// compaction_iterator key and points `current` to the target output group. By
+// default it points to the normal compaction_outputs; if the
+// compaction_iterator key should be placed on the penultimate level, `current`
+// is changed to point to `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
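+//
+// Illustrative flow (assumed, based on AddToOutput() above): for each key the
+// compaction_iterator produces, AddToOutput() routes the key to Current(),
+// which is penultimate_level_outputs_ exactly when
+// iter.output_to_penultimate_level() is true, and compaction_outputs_
+// otherwise.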
+//
+// +----------+          +-----------------------------+      +---------+
+// | *current |--------> |     compaction_outputs      |----->| output  |
+// +----------+          +-----------------------------+      +---------+
+//      |                                                     | output  |
+//      |                                                     +---------+
+//      |                                                     |   ...   |
+//      |
+//      |                +-----------------------------+      +---------+
+//      +--------------> |  penultimate_level_outputs  |----->| output  |
+//                       +-----------------------------+      +---------+
+//                                                            |   ...   |
+
+class SubcompactionState {
+ public:
+  const Compaction* compaction;
+
+  // The boundaries of the key-range this compaction is interested in. No two
+  // sub-compactions may have overlapping key-ranges.
+  // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+  const std::optional<Slice> start, end;
+
+  // The return status of this sub-compaction
+  Status status;
+
+  // The return IO Status of this sub-compaction
+  IOStatus io_status;
+
+  // Notify on sub-compaction completion only if listener was notified on
+  // sub-compaction begin.
+  bool notify_on_subcompaction_completion = false;
+
+  // compaction job stats for this sub-compaction
+  CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different sub-compactions
+  // within the same compaction job.
+  const uint32_t sub_job_id;
+
+  Slice SmallestUserKey() const;
+
+  Slice LargestUserKey() const;
+
+  // Get all outputs from the subcompaction. For per_key_placement compaction,
+  // it returns both the last level outputs and penultimate level outputs.
+  OutputIterator GetOutputs() const;
+
+  // Assign the range-del aggregator. Each range tombstone can only be assigned
+  // to one output level; for per_key_placement, that is the penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    if (compaction->SupportsPerKeyPlacement()) {
+      penultimate_level_outputs_.AssignRangeDelAggregator(
+          std::move(range_del_agg));
+    } else {
+      compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+    }
+  }
+
+  void RemoveLastEmptyOutput() {
+    compaction_outputs_.RemoveLastEmptyOutput();
+    penultimate_level_outputs_.RemoveLastEmptyOutput();
+  }
+
+#ifndef ROCKSDB_LITE
+  void BuildSubcompactionJobInfo(
+      SubcompactionJobInfo& subcompaction_job_info) const {
+    const Compaction* c = compaction;
+    const ColumnFamilyData* cfd = c->column_family_data();
+
+    subcompaction_job_info.cf_id = cfd->GetID();
+    subcompaction_job_info.cf_name = cfd->GetName();
+    subcompaction_job_info.status = status;
+    subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+    subcompaction_job_info.base_input_level = c->start_level();
+    subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.stats = compaction_job_stats;
+  }
+#endif  // !ROCKSDB_LITE
+
+  SubcompactionState() = delete;
+  SubcompactionState(const SubcompactionState&) = delete;
+  SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+  SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+                     const std::optional<Slice> _end, uint32_t _sub_job_id)
+      : compaction(c),
+        start(_start),
+        end(_end),
+        sub_job_id(_sub_job_id),
+        compaction_outputs_(c, /*is_penultimate_level=*/false),
+        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+    assert(compaction != nullptr);
+    // Set the output split key (used for the RoundRobin feature) only for
+    // normal compaction_outputs; the output-to-penultimate-level feature
+    // doesn't support RoundRobin (and may never be supported, because for
+    // RoundRobin, the data time is mostly naturally
sorted, no need to have + // per-key placement with output_to_penultimate_level). + compaction_outputs_.SetOutputSlitKey(start, end); + } + + SubcompactionState(SubcompactionState&& state) noexcept + : compaction(state.compaction), + start(state.start), + end(state.end), + status(std::move(state.status)), + io_status(std::move(state.io_status)), + notify_on_subcompaction_completion( + state.notify_on_subcompaction_completion), + compaction_job_stats(std::move(state.compaction_job_stats)), + sub_job_id(state.sub_job_id), + compaction_outputs_(std::move(state.compaction_outputs_)), + penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)), + is_current_penultimate_level_(state.is_current_penultimate_level_), + has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) { + current_outputs_ = is_current_penultimate_level_ + ? &penultimate_level_outputs_ + : &compaction_outputs_; + } + + bool HasPenultimateLevelOutputs() const { + return has_penultimate_level_outputs_ || + penultimate_level_outputs_.HasRangeDel(); + } + + bool IsCurrentPenultimateLevel() const { + return is_current_penultimate_level_; + } + + // Add all the new files from this compaction to version_edit + void AddOutputsEdit(VersionEdit* out_edit) const { + for (const auto& file : penultimate_level_outputs_.outputs_) { + out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta); + } + for (const auto& file : compaction_outputs_.outputs_) { + out_edit->AddFile(compaction->output_level(), file.meta); + } + } + + void Cleanup(Cache* cache); + + void AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats) const; + + CompactionOutputs& Current() const { + assert(current_outputs_); + return *current_outputs_; + } + + // Add compaction_iterator key/value to the `Current` output group. + Status AddToOutput(const CompactionIterator& iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + + // Close all compaction output files, both output_to_penultimate_level outputs + // and normal outputs. + Status CloseCompactionFiles(const Status& curr_status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output file. + Status s = penultimate_level_outputs_.CloseOutput( + curr_status, open_file_func, close_file_func); + s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func); + return s; + } + + private: + // State kept for output being generated + CompactionOutputs compaction_outputs_; + CompactionOutputs penultimate_level_outputs_; + CompactionOutputs* current_outputs_ = &compaction_outputs_; + bool is_current_penultimate_level_ = false; + bool has_penultimate_level_outputs_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc new file mode 100644 index 000000000..aaebcfd94 --- /dev/null +++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc @@ -0,0 +1,2028 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/listener.h" +#include "rocksdb/utilities/debug.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +#if !defined(ROCKSDB_LITE) + +class TieredCompactionTest : public DBTestBase, + public testing::WithParamInterface { + public: + TieredCompactionTest() + : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true), + kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1), + kBasicPerKeyPlacementCompStats( + CompactionReason::kUniversalSizeAmplification, 1), + kBasicFlushStats(CompactionReason::kFlush, 1) { + kBasicCompStats.micros = kHasValue; + kBasicCompStats.cpu_micros = kHasValue; + kBasicCompStats.bytes_read_non_output_levels = kHasValue; + kBasicCompStats.num_input_files_in_non_output_levels = kHasValue; + kBasicCompStats.num_input_records = kHasValue; + kBasicCompStats.num_dropped_records = kHasValue; + + kBasicPerLevelStats.num_output_records = kHasValue; + kBasicPerLevelStats.bytes_written = kHasValue; + kBasicPerLevelStats.num_output_files = kHasValue; + + kBasicPerKeyPlacementCompStats.micros = kHasValue; + kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue; + kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats); + + kBasicFlushStats.micros = kHasValue; + kBasicFlushStats.cpu_micros = kHasValue; + kBasicFlushStats.bytes_written = kHasValue; + kBasicFlushStats.num_output_files = kHasValue; + } + + protected: + static constexpr uint8_t kHasValue = 1; + + InternalStats::CompactionStats kBasicCompStats; + InternalStats::CompactionStats kBasicPerKeyPlacementCompStats; + InternalStats::CompactionOutputsStats kBasicPerLevelStats; + InternalStats::CompactionStats kBasicFlushStats; + + std::atomic_bool enable_per_key_placement = true; + + void SetUp() override { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast(arg); + *supports_per_key_placement = enable_per_key_placement; + }); + SyncPoint::GetInstance()->EnableProcessing(); + } + + const std::vector& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } + + const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetPerKeyPlacementCompactionStats(); + } + + // Verify the compaction stats, the stats are roughly compared + void VerifyCompactionStats( + const std::vector& expect_stats, + const InternalStats::CompactionStats& expect_pl_stats) { + const std::vector& stats = + GetCompactionStats(); + const size_t kLevels = expect_stats.size(); + ASSERT_EQ(kLevels, stats.size()); + + for (auto it = stats.begin(), expect = 
expect_stats.begin(); + it != stats.end(); it++, expect++) { + VerifyCompactionStats(*it, *expect); + } + + const InternalStats::CompactionStats& pl_stats = + GetPerKeyPlacementCompactionStats(); + VerifyCompactionStats(pl_stats, expect_pl_stats); + } + + void ResetAllStats(std::vector& stats, + InternalStats::CompactionStats& pl_stats) { + ASSERT_OK(dbfull()->ResetStats()); + for (auto& level_stats : stats) { + level_stats.Clear(); + } + pl_stats.Clear(); + } + + // bottommost_temperature is renaming to last_level_temperature, set either + // of them should have the same effect. + void SetColdTemperature(Options& options) { + if (GetParam()) { + options.bottommost_temperature = Temperature::kCold; + } else { + options.last_level_temperature = Temperature::kCold; + } + } + + private: + void CompareStats(uint64_t val, uint64_t expect) { + if (expect > 0) { + ASSERT_TRUE(val > 0); + } else { + ASSERT_EQ(val, 0); + } + } + + void VerifyCompactionStats( + const InternalStats::CompactionStats& stats, + const InternalStats::CompactionStats& expect_stats) { + CompareStats(stats.micros, expect_stats.micros); + CompareStats(stats.cpu_micros, expect_stats.cpu_micros); + CompareStats(stats.bytes_read_non_output_levels, + expect_stats.bytes_read_non_output_levels); + CompareStats(stats.bytes_read_output_level, + expect_stats.bytes_read_output_level); + CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob); + CompareStats(stats.bytes_written, expect_stats.bytes_written); + CompareStats(stats.bytes_moved, expect_stats.bytes_moved); + CompareStats(stats.num_input_files_in_non_output_levels, + expect_stats.num_input_files_in_non_output_levels); + CompareStats(stats.num_input_files_in_output_level, + expect_stats.num_input_files_in_output_level); + CompareStats(stats.num_output_files, expect_stats.num_output_files); + CompareStats(stats.num_output_files_blob, + expect_stats.num_output_files_blob); + CompareStats(stats.num_input_records, expect_stats.num_input_records); + CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records); + CompareStats(stats.num_output_records, expect_stats.num_output_records); + ASSERT_EQ(stats.count, expect_stats.count); + for (int i = 0; i < static_cast(CompactionReason::kNumOfReasons); + i++) { + ASSERT_EQ(stats.counts[i], expect_stats.counts[i]); + } + } +}; + +TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + std::vector seq_history; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + 
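+    // remember the sequence number after each flush; later steps move the
+    // hot/cold cut-off (latest_cold_seq) to these recorded points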
seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // the penultimate level file temperature is not cold, all data are output to + // the penultimate level. + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // basic compaction stats are still counted to the last level + expect_stats[kLastLevel].Add(kBasicCompStats); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + + VerifyCompactionStats(expect_stats, expect_pl_stats); + + ResetAllStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq to split the file into 2 levels, so should have + // both the last level stats and the output_to_penultimate_level stats + latest_cold_seq = seq_history[0]; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // delete all cold data, so all data will be on penultimate level + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ResetAllStats(expect_stats, expect_pl_stats); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq again with range delete, take a snapshot to keep + // the range dels in both cold and hot SSTs + auto snap = db_->GetSnapshot(); + latest_cold_seq = seq_history[2]; + std::string start = Key(25), end = Key(35); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + + ResetAllStats(expect_stats, expect_pl_stats); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // verify data + std::string value; + for (int i = 0; i < kNumKeys; i++) { + if (i < 10 || (i >= 25 && i < 35)) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ReadOptions(), 
Key(i), &value)); + } + } + + // range delete all hot data + start = Key(30); + end = Key(130); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // release the snapshot and do compaction again should remove all hot data + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // 2 range dels are dropped + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 3); + + // move backward the cold_seq, for example the user may change the setting of + // hot/cold data, but it won't impact the existing cold data, as the sequence + // number is zeroed out. + latest_cold_seq = seq_history[1]; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + auto cmp = options.comparator; + + port::Mutex mutex; + std::string hot_start = Key(10); + std::string hot_end = Key(50); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + MutexLock l(&mutex); + context->output_to_penultimate_level = + cmp->Compare(context->key, hot_start) >= 0 && + cmp->Compare(context->key, hot_end) < 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(Flush()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + ResetAllStats(expect_stats, expect_pl_stats); + + // change to all cold, no output_to_penultimate_level output + { + MutexLock l(&mutex); + hot_start = Key(100); + hot_end = Key(200); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + 
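+  // with the hot range moved out of the key space, the whole DB is cold:
+  // nothing should remain at Temperature::kUnknown (the hot tier)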
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // change to all hot, universal compaction support moving data to up level if + // it's within compaction level range. + { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + + // No data is moved from cold tier to hot tier because no input files from L5 + // or higher, it's not safe to move data to output_to_penultimate_level level. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + + // Add 2 keys in higher level, but in separated files, all keys can be moved + // up if it's hot + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(50), "value" + std::to_string(0))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // change to only 1 key cold, to test compaction could stop even it matches + // size amp compaction threshold + { + MutexLock l(&mutex); + hot_start = Key(1); + hot_end = Key(1000); + } + + // generate files just enough to trigger compaction + for (int i = 0; i < kNumTrigger - 1; i++) { + for (int j = 0; j < 1000; j++) { + ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact( + true)); // make sure the compaction is able to finish + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + auto opts = db_->GetOptions(); + auto max_size_amp = + opts.compaction_options_universal.max_size_amplification_percent / 100; + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), + GetSstSizeHelper(Temperature::kCold) * max_size_amp); + + // delete all cold data + ASSERT_OK(Delete(Key(0))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // range delete overlap with both hot/cold data, with a snapshot to make sure + // the range del is saved + auto snap = db_->GetSnapshot(); + { + MutexLock l(&mutex); + hot_start = Key(50); + hot_end = Key(100); + } + std::string start = Key(1), end = Key(70); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped until snapshot is released + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // verify data + std::string value; + for (int i = 0; i < kNumKeys; i++) { + if (i < 70) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), 
&value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + } + } + + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // range del is dropped + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 1); +} + +TEST_P(TieredCompactionTest, LevelColdRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,1", + FilesPerLevel()); // bottommost but not last level file is hot + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // explicitly move the data to the last level + MoveFilesToLevel(kLastLevel); + + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + auto snap = db_->GetSnapshot(); + + std::string start = Key(10); + std::string end = Key(50); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + + // 20->30 will be marked as cold data, but it cannot be placed to cold tier + // (bottommost) otherwise, it will be "deleted" by the range del in + // output_to_penultimate_level level verify that these data will be able to + // queried + for (int i = 20; i < 30; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + // make the range tombstone and data after that cold + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + + // add home hot data, just for test + for (int i = 30; i < 40; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + std::string value; + for (int i = 0; i < kNumKeys; i++) { + auto s = db_->Get(ReadOptions(), Key(i), &value); + if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) { + ASSERT_TRUE(s.IsNotFound()); + } else { + ASSERT_OK(s); + } + } + + db_->ReleaseSnapshot(snap); +} + +// Test SST partitioner cut after every single key +class SingleKeySstPartitioner : public SstPartitioner { + public: + const char* Name() const override { return "SingleKeySstPartitioner"; } + + PartitionerResult ShouldPartition( + const PartitionerRequest& /*request*/) override { + return kRequired; + } + + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } +}; + +class SingleKeySstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { return 
"SingleKeySstPartitionerFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return std::unique_ptr(new SingleKeySstPartitioner()); + } +}; + +TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 3; + const int kNumKeys = 10; + + auto factory = std::make_shared(); + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.sst_partitioner_factory = factory; + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + MoveFilesToLevel(kNumLevels - 1); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_EQ("0,0,10", FilesPerLevel()); + + auto snap = db_->GetSnapshot(); + + // only range delete + std::string start = Key(3); + std::string end = Key(5); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), + 0); // tombstone has no size, even it's in hot tier + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_EQ("0,1,10", + FilesPerLevel()); // one file is at the penultimate level which + // only contains a range delete + + // Add 2 hot keys, each is a new SST, they will be placed in the same level as + // range del, but they don't have overlap with range del, make sure the range + // del will still be placed there + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(Put(Key(0), "new value" + std::to_string(0))); + auto snap2 = db_->GetSnapshot(); + ASSERT_OK(Put(Key(6), "new value" + std::to_string(6))); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,2,10", + FilesPerLevel()); // one file is at the penultimate level + // which only contains a range delete + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + bool found_sst_with_del = false; + uint64_t sst_with_del_num = 0; + for (const auto& meta : live_file_meta) { + if (meta.num_deletions > 0) { + // found SST with del, which has 2 entries, one for data one for range del + ASSERT_EQ(meta.level, + kNumLevels - 2); // output to penultimate level + ASSERT_EQ(meta.num_entries, 2); + ASSERT_EQ(meta.num_deletions, 1); + found_sst_with_del = true; + sst_with_del_num = meta.file_number; + } + } + ASSERT_TRUE(found_sst_with_del); + + // release the first snapshot and compact, which should compact the range del + // but new inserted key `0` and `6` are still hot data which will be placed on + // the penultimate level + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,2,7", FilesPerLevel()); + 
db_->GetLiveFilesMetaData(&live_file_meta); + found_sst_with_del = false; + for (const auto& meta : live_file_meta) { + // check new SST with del (the old one may not yet be deleted after + // compaction) + if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) { + found_sst_with_del = true; + } + } + ASSERT_FALSE(found_sst_with_del); + + // Now make all data cold, key 0 will be moved to the last level, but key 6 is + // still in snap2, so it will be kept at the penultimate level + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,1,8", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->ReleaseSnapshot(snap2); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,8", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, UniversalRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 10; + + auto factory = std::make_shared(); + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.sst_partitioner_factory = factory; + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + // compact to the penultimate level with 10 files + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // make all data cold + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // range del which considered as hot data, but it will be merged and deleted + // with the last level data + std::string start = Key(3); + std::string end = Key(5); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel()); + + // range del with snapshot should be preserved in the penultimate level + auto snap = db_->GetSnapshot(); + + start = Key(6); + end = Key(8); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel()); + + // Add 2 hot keys, each is a new SST, they will be placed in the same level as + // range del, but no overlap with range del. 
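+  // (advance the cut-off first so only the two puts below stay above
+  // latest_cold_seq, i.e. remain hot)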
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(Put(Key(4), "new value" + std::to_string(0))); + auto snap2 = db_->GetSnapshot(); + ASSERT_OK(Put(Key(9), "new value" + std::to_string(6))); + + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel()); + // find the SST with range del + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + bool found_sst_with_del = false; + uint64_t sst_with_del_num = 0; + for (const auto& meta : live_file_meta) { + if (meta.num_deletions > 0) { + // found SST with del, which has 2 entries, one for data one for range del + ASSERT_EQ(meta.level, + kNumLevels - 2); // output_to_penultimate_level level + ASSERT_EQ(meta.num_entries, 2); + ASSERT_EQ(meta.num_deletions, 1); + found_sst_with_del = true; + sst_with_del_num = meta.file_number; + } + } + ASSERT_TRUE(found_sst_with_del); + + // release the first snapshot which should compact the range del, but data on + // the same level is still hot + db_->ReleaseSnapshot(snap); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel()); + db_->GetLiveFilesMetaData(&live_file_meta); + // no range del should be found in SST + found_sst_with_del = false; + for (const auto& meta : live_file_meta) { + // check new SST with del (the old one may not yet be deleted after + // compaction) + if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) { + found_sst_with_del = true; + } + } + ASSERT_FALSE(found_sst_with_del); + + // make all data to cold, but key 6 is still protected by snap2 + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->ReleaseSnapshot(snap2); + + // release snapshot, everything go to bottommost + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + std::vector seq_history; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // non last level is hot + ASSERT_EQ("0,1", FilesPerLevel()); + 
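// Sanity note on the temperature accounting checked below: files written at
+  // the default (hot) temperature report as Temperature::kUnknown, while
+  // SetColdTemperature() (the fixture helper used above) routes last-level
+  // output to Temperature::kCold; GetSstSizeHelper() sums live SST bytes for
+  // a given temperature.
+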
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + expect_stats[1].Add(kBasicCompStats); + expect_stats[1].Add(kBasicPerLevelStats); + expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move all data to the last level + MoveFilesToLevel(kLastLevel); + + ResetAllStats(expect_stats, expect_pl_stats); + + // The compaction won't move the data up + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + last_stats.bytes_read_non_output_levels = 0; + last_stats.num_input_files_in_non_output_levels = 0; + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // Add new data, which is all hot and overriding all existing data + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + ResetAllStats(expect_stats, expect_pl_stats); + + // after compaction, all data are hot + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + for (int level = 2; level < kNumLevels - 1; level++) { + expect_stats[level].bytes_moved = kHasValue; + } + + last_stats.Add(kBasicCompStats); + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq, try to split the data into cold and hot, but in + // this case it's unsafe to split the data + // because it's non-last-level but bottommost file, the sequence number will + // be zeroed out and lost the time information (with + // `level_compaction_dynamic_level_bytes` or Universal Compaction, it should + // be rare.) 
+ // TODO(zjay): ideally we should avoid zero out non-last-level bottommost file + latest_cold_seq = seq_history[1]; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + seq_history.clear(); + + // manually move all data (cold) to last level + MoveFilesToLevel(kLastLevel); + seq_history.clear(); + // Add new data once again + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + latest_cold_seq = seq_history[0]; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // delete all cold data + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + latest_cold_seq = seq_history[2]; + + MoveFilesToLevel(kLastLevel); + + // move forward the cold_seq again with range delete, take a snapshot to keep + // the range dels in bottommost + auto snap = db_->GetSnapshot(); + + std::string start = Key(25), end = Key(35); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + // add one small key and large key in the input level, to make sure it's able + // to move hot data to input level within that range + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(100), "value" + std::to_string(0))); + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // verify data + std::string value; + for (int i = 1; i < 130; i++) { + if (i < 10 || (i >= 25 && i < 35)) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + } + } + + // delete all hot data + ASSERT_OK(Delete(Key(0))); + start = Key(30); + end = Key(101); // range [101, 130] is cold, because it's not in input range + // in previous compaction + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + db_->ReleaseSnapshot(snap); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // 3 range dels dropped, the first one is double counted as expected, which is + // spread into 2 SST files + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 3); + 
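+  // (The "double counted" tombstone above: when a range del straddles an
+  // output-file boundary, it is written into both SSTs by the single-key
+  // partitioner, and dropping it later bumps
+  // COMPACTION_RANGE_DEL_DROP_OBSOLETE once per file, hence 3 instead of 2.)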
+  // Move the cold_seq cutoff backward, which can happen when the user
+  // changes the setting. The hot data will not move back up; this only
+  // checks that the compaction still runs fine, because:
+  // 1. the sequence numbers were zeroed out, so there is no time information
+  // 2. leveled compaction only supports moving data up within the higher
+  //    level input range
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  auto options = CurrentOptions();
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+  auto cmp = options.comparator;
+
+  port::Mutex mutex;
+  std::string hot_start = Key(10);
+  std::string hot_end = Key(50);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        MutexLock l(&mutex);
+        context->output_to_penultimate_level =
+            cmp->Compare(context->key, hot_start) >= 0 &&
+            cmp->Compare(context->key, hot_end) < 0;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all cold
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(100);
+    hot_end = Key(200);
+  }
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all hot, but level compaction only supports moving cold data
+  // to the hot tier within its higher-level input range.
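+  // (Nothing is expected to move here: with no files on the penultimate
+  // level, the safe output range for moving data up is empty, so the keys
+  // stay on the last level even though the callback now reports all of them
+  // as hot.)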
+ { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // with mixed hot/cold data + { + MutexLock l(&mutex); + hot_start = Key(50); + hot_end = Key(100); + } + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(100), "value" + std::to_string(100))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // delete all hot data, but with snapshot to keep the range del + auto snap = db_->GetSnapshot(); + std::string start = Key(50); + std::string end = Key(100); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // release the snapshot and do compaction again should remove all hot data + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 1); +} + +INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest, + testing::Bool()); + +class PrecludeLastLevelTest : public DBTestBase { + public: + PrecludeLastLevelTest() + : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_ = std::make_unique(env_, mock_clock_); + } + + protected: + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + auto periodic_task_scheduler_ptr = + reinterpret_cast(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); + }); + mock_clock_->SetCurrentTime(0); + } +}; + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 10000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num 
* (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + // all data is hot, even they're in the last level + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // Generate a sstable and trigger manual compaction + ASSERT_OK(Put(Key(10), "value")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // all data is moved up to the penultimate level + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // close explicitly, because the env is local variable which will be released + // first. + Close(); +} + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 10000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + // make sure it won't trigger Size Amp compaction, unlike normal Size Amp + // compaction which is typically a last level compaction, when tiered Storage + // ("preclude_last_level") is enabled, size amp won't include the last level. + // As the last level would be in cold tier and the size would not be a + // problem, which also avoid frequent hot to cold storage compaction. 
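+  // (Hypothetical numbers for intuition: with the default 200% cap, roughly
+  // 2 units of newer data sitting on top of 1 unit in the base run would
+  // already trigger a full compaction; 400% gives this test enough headroom
+  // that size-amp never fires and only the intended compactions run.)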
+ options.compaction_options_universal.max_size_amplification_percent = 400; + Reopen(options); + + // all data is hot, even they're in the last level + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // Write more data, but still all hot until the 10th SST, as: + // write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds + // The preclude_last_level_data_seconds is 10k + Random rnd(301); + for (; sst_num < kNumTrigger * 2 - 1; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + // the value needs to be big enough to trigger full compaction + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->WaitForCompact(true)); + } + + // all data is moved up to the penultimate level + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // close explicitly, because the env is local variable which will be released + // first. + Close(); +} + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + std::vector key_versions; + ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + + // make sure there're more than 300 keys and first 100 keys are having seqno + // zeroed out, the last 100 key seqno not zeroed out + ASSERT_GT(key_versions.size(), 300); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(key_versions[i].sequence, 0); + } + auto rit = key_versions.rbegin(); + for (int i = 0; i < 100; i++) { + ASSERT_GT(rit->sequence, 0); + rit++; + } + + // enable preclude feature + options.preclude_last_level_data_seconds = 2000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + // Generate a sstable and trigger manual compaction + ASSERT_OK(Put(Key(10), "value")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // some data are moved up, some are not + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + 
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + Close(); +} + +TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preclude_last_level_data_seconds = 60; + options.preserve_internal_time_seconds = 0; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.last_level_temperature = Temperature::kCold; + DestroyAndReopen(options); + + Random rnd(301); + + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(10) + 1)); + }); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(2))); + }); + } + ASSERT_OK(Flush()); + + TablePropertiesCollection tables_props; + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); + ASSERT_EQ(tables_props.size(), 1); + ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty()); + SeqnoToTimeMapping tp_mapping; + ASSERT_OK( + tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping)); + ASSERT_OK(tp_mapping.Sort()); + ASSERT_FALSE(tp_mapping.Empty()); + auto seqs = tp_mapping.TEST_GetInternalMapping(); + ASSERT_FALSE(seqs.empty()); + + // Wait more than preclude_last_level time, then make sure all the data is + // compacted to the last level even there's no write (no seqno -> time + // information was flushed to any SST). + mock_clock_->MockSleepForSeconds(100); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + Close(); +} + +TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 2000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // some data 
are moved up, some are not + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + std::vector key_versions; + ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + + // make sure there're more than 300 keys and first 100 keys are having seqno + // zeroed out, the last 100 key seqno not zeroed out + ASSERT_GT(key_versions.size(), 300); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(key_versions[i].sequence, 0); + } + auto rit = key_versions.rbegin(); + for (int i = 0; i < 100; i++) { + ASSERT_GT(rit->sequence, 0); + rit++; + } + + Close(); +} + +class PrecludeLastLevelTestWithParms + : public PrecludeLastLevelTest, + public testing::WithParamInterface { + public: + PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {} +}; + +TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + bool enable_preclude_last_level = GetParam(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + Random rnd(301); + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + std::atomic_bool is_manual_compaction_running = false; + std::atomic_bool verified_compaction_order = false; + + // Make sure the manual compaction is in progress and try to trigger a + // SizeRatio compaction by flushing 4 files to L0. The compaction will try to + // compact 4 files at L0 to L5 (the last empty level). + // If the preclude_last_feature is enabled, the auto triggered compaction + // cannot be picked. Otherwise, the auto triggered compaction can run in + // parallel with the last level compaction. + // L0: [a] [b] [c] [d] + // L5: (locked if preclude_last_level is enabled) + // L6: [z] (locked: manual compaction in progress) + // TODO: in this case, L0 files should just be compacted to L4, so the 2 + // compactions won't be overlapped. 
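+  // The two callbacks plus the LoadDependency edges below choreograph the
+  // race: (1) the manual compaction parks at ManualCompaction1, (2) the main
+  // thread is unblocked to flush 4 more L0 files, (3) an automatic compaction
+  // pick is attempted and verified (it must be null iff preclude is enabled),
+  // and only then (4) ManualCompaction2 lets the manual compaction finish.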
+ SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { + auto compaction = static_cast(arg); + if (compaction->is_manual_compaction()) { + is_manual_compaction_running = true; + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction1"); + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction2"); + is_manual_compaction_running = false; + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + auto compaction = static_cast(arg); + if (enable_preclude_last_level && is_manual_compaction_running) { + ASSERT_TRUE(compaction == nullptr); + verified_compaction_order = true; + } else { + ASSERT_TRUE(compaction != nullptr); + verified_compaction_order = true; + } + if (!compaction || !compaction->is_manual_compaction()) { + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "AutoCompactionPicked"); + } + }); + + SyncPoint::GetInstance()->LoadDependency({ + {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction1", + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"}, + {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "AutoCompactionPicked", + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction2"}, + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + // only enable if the Parameter is true + if (enable_preclude_last_level) { + options.preclude_last_level_data_seconds = 2000; + } + options.max_background_jobs = 8; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"); + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + for (; sst_num < kNumTrigger * 2; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + // the value needs to be big enough to trigger full compaction + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + + manual_compaction_thread.join(); + + ASSERT_OK(dbfull()->WaitForCompact(true)); + + if (enable_preclude_last_level) { + ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel()); + } else { + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + } + ASSERT_TRUE(verified_compaction_order); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + stop_token.reset(); + + Close(); +} + +INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms, + PrecludeLastLevelTestWithParms, testing::Bool()); + +// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...] 
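+// The table builder consults ShouldPartition() on each key transition, so
+// cutting exactly at Key(20) and Key(40) produces the three ranges above; a
+// sketch of the decision (assuming the bytewise ordering of DBTestBase::Key):
+//
+//   prev = Key(19), current = Key(20)  -> kRequired   (start a new file)
+//   prev = Key(20), current = Key(21)  -> kNotRequired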
+class ThreeRangesPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) override {
+    if ((cmp->CompareWithoutTimestamp(*request.current_user_key,
+                                      DBTestBase::Key(20)) >= 0 &&
+         cmp->CompareWithoutTimestamp(*request.prev_user_key,
+                                      DBTestBase::Key(20)) < 0) ||
+        (cmp->CompareWithoutTimestamp(*request.current_user_key,
+                                      DBTestBase::Key(40)) >= 0 &&
+         cmp->CompareWithoutTimestamp(*request.prev_user_key,
+                                      DBTestBase::Key(40)) < 0)) {
+      return kRequired;
+    } else {
+      return kNotRequired;
+    }
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+
+  const Comparator* cmp = BytewiseComparator();
+};
+
+class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() {
+    return "TombstoneTestSstPartitionerFactory";
+  }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner());
+  }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // Pass some time first; otherwise the write times of the first few keys
+  // would be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  Random rnd(301);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // make sure all data is compacted to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // Create 3 L5 files
+  auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  Reopen(options);
+
+  for (int i = 0; i < kNumTrigger - 1; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // L5: [0,19] [20,39] [40,299]
+  // L6: [0, 299]
+  ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
+
+  // enable the tiered storage feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  options.statistics = CreateDBStatistics();
+  Reopen(options);
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[5].files.size(), 3);
+  ASSERT_EQ(meta.levels[6].files.size(), 1);
+  ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+  ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299));
+
+  std::string file_path = meta.levels[5].files[1].db_path;
+  std::vector<std::string> files;
+  // pick the 3rd file @L5 + the file @L6 for compaction
+  files.push_back(file_path + "/" +
meta.levels[5].files[2].name); + files.push_back(file_path + "/" + meta.levels[6].files[0].name); + ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6)); + + // The compaction only moved partial of the hot data to hot tier, range[0,39] + // is unsafe to move up, otherwise, they will be overlapped with the existing + // files@L5. + // The output should be: + // L5: [0,19] [20,39] [40,299] <-- Temperature::kUnknown + // L6: [0,19] [20,39] <-- Temperature::kCold + // L6 file is split because of the customized partitioner + ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel()); + + // even all the data is hot, but not all data are moved to the hot tier + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[5].files.size(), 3); + ASSERT_EQ(meta.levels[6].files.size(), 2); + for (const auto& file : meta.levels[5].files) { + ASSERT_EQ(file.temperature, Temperature::kUnknown); + } + for (const auto& file : meta.levels[6].files) { + ASSERT_EQ(file.temperature, Temperature::kCold); + } + ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0)); + ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19)); + ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20)); + ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39)); + + Close(); +} + +struct TestPropertiesCollector : public TablePropertiesCollector { + Status AddUserKey(const Slice& key, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + if (cmp->Compare(key, DBTestBase::Key(100)) == 0) { + has_key_100 = true; + } + if (cmp->Compare(key, DBTestBase::Key(200)) == 0) { + has_key_200 = true; + } + + return Status::OK(); + } + + const char* Name() const override { return "TestTablePropertiesCollector"; } + + UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties ret; + return ret; + } + + Status Finish(UserCollectedProperties* /*properties*/) override { + // The LSM tree would be like: + // L5: [0,19] [20,39] [40,299] + // L6: [0, 299] + // the 3rd file @L5 has both 100 and 200, which will be marked for + // compaction + // Also avoid marking flushed SST for compaction, which won't have both 100 + // and 200 + if (has_key_100 && has_key_200) { + need_compact_ = true; + } else { + need_compact_ = false; + } + has_key_100 = false; + has_key_200 = false; + return Status::OK(); + } + + bool NeedCompact() const override { return need_compact_; } + + const Comparator* cmp = BytewiseComparator(); + + private: + bool has_key_100 = false; + bool has_key_200 = false; + + bool need_compact_ = false; +}; + +class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TestPropertiesCollector; + } + const char* Name() const override { return "TestTablePropertiesCollector"; } +}; + +TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + // set a small max_compaction_bytes to avoid input level expansion + 
options.max_compaction_bytes = 30000;
+  options.ignore_max_compaction_bytes_for_input = false;
+  DestroyAndReopen(options);
+
+  // Pass some time first; otherwise the write times of the first few keys
+  // would be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  Random rnd(301);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // make sure all data is compacted to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // Create 3 L5 files
+  auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  // the user-defined properties collector will mark the 3rd file for
+  // compaction
+  auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+  // enable the tiered storage feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  for (int i = 0; i < kNumTrigger - 2; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // make sure there is one and only one compaction that supports per-key
+  // placement but has the penultimate level output disabled.
+  std::atomic_int per_key_comp_num = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        if (compaction->SupportsPerKeyPlacement()) {
+          ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
+                    Compaction::PenultimateOutputRangeType::kDisabled);
+          per_key_comp_num++;
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int j = 0; j < 100; j++) {
+    ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
+  }
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(32), Key(40)));
+  ASSERT_OK(Flush());
+
+  // Before the per-key placement compaction, the LSM tree should look like:
+  // L5: [0,19] [20,40] [40,299]
+  // L6: [0, 299]
+  // The 2nd file @L5 has largest key 40 because of the range del.
+
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  ASSERT_EQ(per_key_comp_num, 1);
+
+  // the compaction won't move any data to the penultimate level
+  ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
+
+  Close();
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..e5e3493b3
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,678 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include +#include +#include + +#include "memtable/stl_wrappers.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/hash.h" +#include "util/kv_map.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { + +static const Comparator* kTestComparator = nullptr; + +class KVIter : public Iterator { + public: + explicit KVIter(const stl_wrappers::KVMap* map) + : map_(map), iter_(map_->end()) {} + bool Valid() const override { return iter_ != map_->end(); } + void SeekToFirst() override { iter_ = map_->begin(); } + void SeekToLast() override { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + void Seek(const Slice& k) override { + iter_ = map_->lower_bound(k.ToString()); + } + void SeekForPrev(const Slice& k) override { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } + void Next() override { ++iter_; } + void Prev() override { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + Slice key() const override { return iter_->first; } + Slice value() const override { return iter_->second; } + Status status() const override { return Status::OK(); } + + private: + const stl_wrappers::KVMap* const map_; + stl_wrappers::KVMap::const_iterator iter_; +}; + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} + +// Measuring operations on DB (expect to be empty). +// source_strings are candidate keys +void DoRandomIteraratorTest(DB* db, std::vector source_strings, + Random* rnd, int num_writes, int num_iter_ops, + int num_trigger_flush) { + stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator))); + + for (int i = 0; i < num_writes; i++) { + if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) { + db->Flush(FlushOptions()); + } + + int type = rnd->Uniform(2); + int index = rnd->Uniform(static_cast(source_strings.size())); + auto& key = source_strings[index]; + switch (type) { + case 0: + // put + map[key] = key; + ASSERT_OK(db->Put(WriteOptions(), key, key)); + break; + case 1: + // delete + if (map.find(key) != map.end()) { + map.erase(key); + } + ASSERT_OK(db->Delete(WriteOptions(), key)); + break; + default: + assert(false); + } + } + + std::unique_ptr iter(db->NewIterator(ReadOptions())); + std::unique_ptr result_iter(new KVIter(&map)); + + bool is_valid = false; + for (int i = 0; i < num_iter_ops; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd->Uniform(6); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + case 4: + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + default: { + assert(type 
== 5); + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + std::string result; + auto status = db->Get(ReadOptions(), key, &result); + if (map.find(key) == map.end()) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_EQ(map[key], result); + } + break; + } + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } +} + +class DoubleComparator : public Comparator { + public: + DoubleComparator() {} + + const char* Name() const override { return "DoubleComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { +#ifndef CYGWIN + double da = std::stod(a.ToString()); + double db = std::stod(b.ToString()); +#else + double da = std::strtod(a.ToString().c_str(), 0 /* endptr */); + double db = std::strtod(a.ToString().c_str(), 0 /* endptr */); +#endif + if (da == db) { + return a.compare(b); + } else if (da > db) { + return 1; + } else { + return -1; + } + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class HashComparator : public Comparator { + public: + HashComparator() {} + + const char* Name() const override { return "HashComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + uint32_t ha = Hash(a.data(), a.size(), 66); + uint32_t hb = Hash(b.data(), b.size(), 66); + if (ha == hb) { + return a.compare(b); + } else if (ha > hb) { + return 1; + } else { + return -1; + } + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class TwoStrComparator : public Comparator { + public: + TwoStrComparator() {} + + const char* Name() const override { return "TwoStrComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() >= 2); + assert(b.size() >= 2); + size_t size_a1 = static_cast(a[0]); + size_t size_b1 = static_cast(b[0]); + size_t size_a2 = static_cast(a[1]); + size_t size_b2 = static_cast(b[1]); + assert(size_a1 + size_a2 + 2 == a.size()); + assert(size_b1 + size_b2 + 2 == b.size()); + + Slice a1 = Slice(a.data() + 2, size_a1); + Slice b1 = Slice(b.data() + 2, size_b1); + Slice a2 = Slice(a.data() + 2 + size_a1, size_a2); + Slice b2 = Slice(b.data() + 2 + size_b1, size_b2); + + if (a1 != b1) { + return a1.compare(b1); + } + return a2.compare(b2); + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; +} // anonymous namespace + +class ComparatorDBTest + : public testing::Test, + virtual public ::testing::WithParamInterface { + private: + std::string dbname_; + Env* env_; + DB* db_; + Options last_options_; + std::unique_ptr comparator_guard; + + public: + ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { + kTestComparator = BytewiseComparator(); + dbname_ = test::PerThreadDBPath("comparator_db_test"); + BlockBasedTableOptions toptions; + toptions.format_version = GetParam(); + last_options_.table_factory.reset( + ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions)); + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + ~ComparatorDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, last_options_)); + kTestComparator = BytewiseComparator(); + } + + DB* GetDB() { return db_; } + + void SetOwnedComparator(const Comparator* cmp, bool owner = 
true) { + if (owner) { + comparator_guard.reset(cmp); + } else { + comparator_guard.reset(); + } + kTestComparator = cmp; + last_options_.comparator = cmp; + } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, + testing::Values(kLatestFormatVersion)); + +TEST_P(ComparatorDBTest, Bytewise) { + for (int rand_seed = 301; rand_seed < 306; rand_seed++) { + DestroyAndReopen(); + Random rnd(rand_seed); + DoRandomIteraratorTest(GetDB(), + {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd, + 8, 100, 3); + } +} + +TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) { + SetOwnedComparator(new test::SimpleSuffixReverseComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + std::vector source_prefixes; + // Randomly generate 5 prefixes + for (int i = 0; i < 5; i++) { + source_prefixes.push_back(rnd.HumanReadableString(8)); + } + for (int j = 0; j < 20; j++) { + int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); + std::string key = source_prefixes[prefix_index] + + rnd.HumanReadableString(rnd.Uniform(8)); + source_strings.push_back(key); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66); + } +} + +TEST_P(ComparatorDBTest, Uint64Comparator) { + SetOwnedComparator(test::Uint64Comparator(), false /* owner */); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + Random64 rnd64(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint64_t r = rnd64.Next(); + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&r), 8); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, DoubleComparator) { + SetOwnedComparator(new DoubleComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint32_t r = rnd.Next(); + uint32_t divide_order = rnd.Uniform(8); + double to_divide = 1.0; + for (uint32_t j = 0; j < divide_order; j++) { + to_divide *= 10.0; + } + source_strings.push_back(std::to_string(r / to_divide)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, HashComparator) { + SetOwnedComparator(new HashComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 
0; i < 100; i++) { + source_strings.push_back(test::RandomKey(&rnd, 8)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, TwoStrComparator) { + SetOwnedComparator(new TwoStrComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + std::string str; + uint32_t size1 = rnd.Uniform(8); + uint32_t size2 = rnd.Uniform(8); + str.append(1, static_cast(size1)); + str.append(1, static_cast(size2)); + str.append(test::RandomKey(&rnd, size1)); + str.append(test::RandomKey(&rnd, size2)); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +namespace { +void VerifyNotSuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +void VerifySuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + // Should be true but that increases exposure to a design bug in + // auto_prefix_mode, so currently set to FALSE + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +} // anonymous namespace + +TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { + { + // different length + Slice s("abcxy"); + Slice t("abcxyz"); + VerifyNotSuccessor(s, t); + } + { + Slice s("abcxyz"); + Slice t("abcxy"); + VerifyNotSuccessor(s, t); + } + { + // not last byte different + Slice s("abc1xyz"); + Slice t("abc2xyz"); + VerifyNotSuccessor(s, t); + } + { + // same string + Slice s("abcxyz"); + Slice t("abcxyz"); + VerifyNotSuccessor(s, t); + } + { + Slice s("abcxy"); + Slice t("abcxz"); + VerifySuccessor(s, t); + } + { + const char s_array[] = "\x50\x8a\xac"; + const char t_array[] = "\x50\x8a\xad"; + Slice s(s_array); + Slice t(t_array); + VerifySuccessor(s, t); + } + { + const char s_array[] = "\x50\x8a\xff"; + const char t_array[] = "\x50\x8b\x00"; + Slice s(s_array, 3); + Slice t(t_array, 3); + VerifySuccessor(s, t); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x00"; + Slice s(s_array, 4); + Slice t(t_array, 4); + VerifySuccessor(s, t); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x01"; + Slice s(s_array, 4); + Slice t(t_array, 4); + VerifyNotSuccessor(s, t); + } +} + +TEST_P(ComparatorDBTest, FindShortestSeparator) { + std::string s1 = "abc1xyz"; + std::string s2 = "abc3xy"; + + BytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc2", s1); + + s1 = "abc5xyztt"; + + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc5", s1); + + s1 = "abc3"; + s2 = "abc2xy"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = "abc2xy"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = 
"abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + std::string old_s1 = s1 = "abc2xy"; + s2 = "abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_TRUE(old_s1 >= s1); + ASSERT_TRUE(s1 > s2); +} + +TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) { + // Char list for boundary cases. + std::array char_list{{0, 1, 2, 253, 254, 255}}; + Random rnd(301); + + for (int attempts = 0; attempts < 1000; attempts++) { + uint32_t size1 = rnd.Skewed(4); + uint32_t size2; + + if (rnd.OneIn(2)) { + // size2 to be random size + size2 = rnd.Skewed(4); + } else { + // size1 is within [-2, +2] of size1 + int diff = static_cast(rnd.Uniform(5)) - 2; + int tmp_size2 = static_cast(size1) + diff; + if (tmp_size2 < 0) { + tmp_size2 = 0; + } + size2 = static_cast(tmp_size2); + } + + std::string s1; + std::string s2; + for (uint32_t i = 0; i < size1; i++) { + if (rnd.OneIn(2)) { + // Use random byte + s1 += static_cast(rnd.Uniform(256)); + } else { + // Use one byte in char_list + char c = static_cast(char_list[rnd.Uniform(sizeof(char_list))]); + s1 += c; + } + } + + // First set s2 to be the same as s1, and then modify s2. + s2 = s1; + s2.resize(size2); + // We start from the back of the string + if (size2 > 0) { + uint32_t pos = size2 - 1; + do { + if (pos >= size1 || rnd.OneIn(4)) { + // For 1/4 chance, use random byte + s2[pos] = static_cast(rnd.Uniform(256)); + } else if (rnd.OneIn(4)) { + // In 1/4 chance, stop here. + break; + } else { + // Create a char within [-2, +2] of the matching char of s1. + int diff = static_cast(rnd.Uniform(5)) - 2; + // char may be signed or unsigned based on platform. + int s1_char = static_cast(static_cast(s1[pos])); + int s2_char = s1_char + diff; + if (s2_char < 0) { + s2_char = 0; + } + if (s2_char > 255) { + s2_char = 255; + } + s2[pos] = static_cast(s2_char); + } + } while (pos-- != 0); + } + + // Test separators + for (int rev = 0; rev < 2; rev++) { + if (rev == 1) { + // switch s1 and s2 + std::string t = s1; + s1 = s2; + s2 = t; + } + std::string separator = s1; + BytewiseComparator()->FindShortestSeparator(&separator, s2); + std::string rev_separator = s1; + ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2); + + if (s1 == s2) { + ASSERT_EQ(s1, separator); + ASSERT_EQ(s2, rev_separator); + } else if (s1 < s2) { + ASSERT_TRUE(s1 <= separator); + ASSERT_TRUE(s2 > separator); + ASSERT_LE(separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, rev_separator); + } else { + ASSERT_TRUE(s1 >= rev_separator); + ASSERT_TRUE(s2 < rev_separator); + ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, separator); + } + } + + // Test successors + std::string succ = s1; + BytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ >= s1); + + succ = s1; + ReverseBytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ <= s1); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc new file mode 100644 index 000000000..6344d356d --- /dev/null +++ b/src/rocksdb/db/convenience.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#ifndef ROCKSDB_LITE + +#include "rocksdb/convenience.h" + +#include "db/db_impl/db_impl.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +void CancelAllBackgroundWork(DB* db, bool wait) { + (static_cast_with_check<DBImpl>(db->GetRootDB())) + ->CancelAllBackgroundWork(wait); +} + +Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool include_end) { + RangePtr range(begin, end); + return DeleteFilesInRanges(db, column_family, &range, 1, include_end); +} + +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, bool include_end) { + return (static_cast_with_check<DBImpl>(db->GetRootDB())) + ->DeleteFilesInRanges(column_family, ranges, n, include_end); +} + +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path) { + return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); +} +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path, + const SequenceNumber& largest_seqno) { + std::unique_ptr<FSRandomAccessFile> file; + uint64_t file_size; + InternalKeyComparator internal_comparator(options.comparator); + ImmutableOptions ioptions(options); + + Status s = ioptions.fs->NewRandomAccessFile( + file_path, FileOptions(env_options), &file, nullptr); + if (s.ok()) { + s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); + } else { + return s; + } + std::unique_ptr<TableReader> table_reader; + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader( + std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */, + nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, + ioptions.rate_limiter.get())); + const bool kImmortal = true; + auto reader_options = TableReaderOptions( + ioptions, options.prefix_extractor, env_options, internal_comparator, + false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, + -1 /* level */); + reader_options.largest_seqno = largest_seqno; + s = ioptions.table_factory->NewTableReader( + reader_options, std::move(file_reader), file_size, &table_reader, + false /* prefetch_index_and_filter_in_cache */); + if (!s.ok()) { + return s; + } + s = table_reader->VerifyChecksum(read_options, + TableReaderCaller::kUserVerifyChecksum); + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc new file mode 100644 index 000000000..8ccac6130 --- /dev/null +++ b/src/rocksdb/db/corruption_test.cc @@ -0,0 +1,1587 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
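+// +// In broad strokes, the tests below flip or truncate bytes in WAL, SST and +// MANIFEST files on disk, then check that reads, checksum verification and +// recovery surface the damage (typically as Status::Corruption) rather than +// silently returning bad data.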
+ +#include "rocksdb/options.h" +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/write_batch.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/meta_blocks.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +static constexpr int kValueSize = 1000; +namespace { +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + explicit ErrorEnv(Env* _target) + : EnvWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0) {} + const char* Name() const override { return "ErrorEnv"; } + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; +} // anonymous namespace +class CorruptionTest : public testing::Test { + public: + std::shared_ptr env_guard_; + ErrorEnv* env_; + std::string dbname_; + std::shared_ptr tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() { + // If LRU cache shard bit is smaller than 2 (or -1 which will automatically + // set it to 0), test SequenceNumberRecovery will fail, likely because of a + // bug in recovery code. Keep it 4 for now to make the test passes. + tiny_cache_ = NewLRUCache(100, 4); + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(base_env, nullptr); + env_ = new ErrorEnv(base_env); + options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + options_.env = env_; + dbname_ = test::PerThreadDBPath(env_, "corruption_test"); + Status s = DestroyDB(dbname_, options_); + EXPECT_OK(s); + + db_ = nullptr; + options_.create_if_missing = true; + BlockBasedTableOptions table_options; + table_options.block_size_deviation = 0; // make unit test pass for now + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + delete db_; + db_ = nullptr; + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still at %s\n", dbname_.c_str()); + } else { + Options opts; + opts.env = env_->target(); + EXPECT_OK(DestroyDB(dbname_, opts)); + } + delete env_; + } + + void CloseDb() { + delete db_; + db_ = nullptr; + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opt = (options ? *options : options_); + if (opt.env == Options().env) { + // If env is not overridden, replace it with ErrorEnv. + // Otherwise, the test already uses a non-default Env. 
+ opt.env = env_; + } + opt.arena_block_size = 4096; + BlockBasedTableOptions table_options; + table_options.block_cache = tiny_cache_; + table_options.block_size_deviation = 0; + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } + + void RepairDB() { + delete db_; + db_ = nullptr; + ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_)); + } + + void Build(int n, int start, int flush_every) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + if (flush_every != 0 && i != 0 && i % flush_every == 0) { + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + } + // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i + start, &key_space); + batch.Clear(); + ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); } + + void Check(int min_expected, int max_expected) { + uint64_t next_expected = 0; + uint64_t missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + // Do not verify checksums. If we verify checksums then the + // db itself will raise errors because data is corrupted. + // Instead, we want the reads to be successful and this test + // will detect whether the appropriate corruptions have + // occurred. + Iterator* iter = db_->NewIterator(ReadOptions(false, true)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(static_cast<int>(key), &value_space)) { + bad_values++; + } else { + correct++; + } + } + iter->status().PermitUncheckedError(); + delete iter; + + fprintf( + stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n", + min_expected, max_expected, correct, bad_keys, bad_values, + static_cast<unsigned long long>(missed)); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector<std::string> filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + static_cast<int>(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = static_cast<int>(number); + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } + + // Corrupts exactly one file at level `level`. If no file is
found at that level, + // asserts. + void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) { + std::vector<LiveFileMetaData> metadata; + db_->GetLiveFilesMetaData(&metadata); + for (const auto& m : metadata) { + if (m.level == level) { + ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + bytes_to_corrupt)); + return; + } + } + FAIL() << "no file found at level"; + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + if (k == 0) { + // Ugh. Random seed of 0 used to produce no entropy. This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. + *storage = std::string(kValueSize, ' '); + } else { + Random r(k); + *storage = r.RandomString(kValueSize); + } + return Slice(*storage); + } + + void GetSortedWalFiles(std::vector<uint64_t>& file_nums) { + std::vector<std::string> tmp_files; + ASSERT_OK(env_->GetChildren(dbname_, &tmp_files)); + FileType type = kWalFile; + for (const auto& file : tmp_files) { + uint64_t number = 0; + if (ParseFileName(file, &number, &type) && type == kWalFile) { + file_nums.push_back(number); + } + } + std::sort(file_nums.begin(), file_nums.end()); + } + + void CorruptFileWithTruncation(FileType file, uint64_t number, + uint64_t bytes_to_truncate = 0) { + std::string path; + switch (file) { + case FileType::kWalFile: + path = LogFileName(dbname_, number); + break; + // TODO: Add other file types as this method is being used for those file + // types. + default: + return; + } + uint64_t old_size = 0; + ASSERT_OK(env_->GetFileSize(path, &old_size)); + assert(old_size > bytes_to_truncate); + uint64_t new_size = old_size - bytes_to_truncate; + // If bytes_to_truncate == 0, it will do full truncation. + if (bytes_to_truncate == 0) { + new_size = 0; + } + ASSERT_OK(test::TruncateFile(env_, path, new_size)); + } +}; + +TEST_F(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); +#ifdef OS_WIN + // On Windows the OS disk cache does not behave properly: we do not call + // FlushBuffers on every Flush. If we do not close the log file prior to + // the corruption, we end up with the first block not corrupted but only + // the second. However, under the debugger things work just fine but never + // pass when running normally. For that reason people may want to run with + // unbuffered I/O. That option is not available for the WAL though. + CloseDb(); +#endif + Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record + Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block + ASSERT_TRUE(!TryReopen().ok()); + options_.paranoid_checks = false; + Reopen(&options_); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) { + // Repro for bug where WALs following the point-in-time recovery were not + // retained, leading to the next recovery failing.
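+ // Rough outline of the repro: (1) open with two column families and + // advance the WAL number, (2) truncate the tail of an un-synced WAL, + // (3) recover in kPointInTimeRecovery mode and flush only one column + // family, (4) reopen twice to verify the post-PITR WALs were retained.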
+ CloseDb(); + + options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + const std::string test_cf_name = "test_cf"; + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions()); + + uint64_t log_num; + { + options_.create_missing_column_families = true; + std::vector<ColumnFamilyHandle*> cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2")); + std::vector<uint64_t> file_nums; + GetSortedWalFiles(file_nums); + log_num = file_nums.back(); + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } + + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/1); + + { + // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation. + options_.avoid_flush_during_recovery = true; + std::vector<ColumnFamilyHandle*> cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + + // Flush one but not both CFs and write some data so there's a seqno gap + // between the PITR corruption and the next DB session's first WAL. + ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2")); + ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1])); + + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } + + // With the bug, this DB open would remove the WALs following the PITR + // corruption. Then, the next recovery would fail. + for (int i = 0; i < 2; ++i) { + std::vector<ColumnFamilyHandle*> cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } +} + +TEST_F(CorruptionTest, RecoverWriteError) { + env_->writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST_F(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_->writable_file_error_ = true; + const int num = + static_cast<int>(3 + (Options().write_buffer_size / kValueSize)); + std::string value_storage; + Status s; + bool failed = false; + for (int i = 0; i < num; i++) { + WriteBatch batch; + ASSERT_OK(batch.Put("a", Value(100, &value_storage))); + s = db_->Write(WriteOptions(), &batch); + if (!s.ok()) { + failed = true; + } + ASSERT_TRUE(!failed || !s.ok()); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_->num_writable_file_errors_, 1); + env_->writable_file_error_ = false; + Reopen(); +} + +TEST_F(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); + ASSERT_NOK(dbi->VerifyChecksum()); +} + +TEST_F(CorruptionTest, VerifyChecksumReadahead) { + Options options; + SpecialEnv senv(env_->target()); + options.env = &senv; + // Disable block cache as we are going to check checksum for + // the same file twice and measure number of reads.
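+ // (With a block cache enabled, the second verification pass could be + // served from cache and the read counters below would under-count.)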
+ BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc)); + + Reopen(&options); + + Build(10000); + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); + + senv.count_random_reads_ = true; + senv.random_read_counter_.Reset(); + ASSERT_OK(dbi->VerifyChecksum()); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read(), 0); + + // The SST file is about 10MB. Default readahead size is 256KB. + // Give a conservative 20 reads for metadata blocks, The number + // of random reads should be within 10 MB / 256KB + 20 = 60. + ASSERT_LT(senv.random_read_counter_.Read(), 60); + + senv.random_read_bytes_counter_ = 0; + ReadOptions ro; + ro.readahead_size = size_t{32 * 1024}; + ASSERT_OK(dbi->VerifyChecksum(ro)); + // The SST file is about 10MB. We set readahead size to 32KB. + // Give 0 to 20 reads for metadata blocks, and allow real read + // to range from 24KB to 48KB. The lower bound would be: + // 10MB / 48KB + 0 = 213 + // The higher bound is + // 10MB / 24KB + 20 = 447. + ASSERT_GE(senv.random_read_counter_.Read(), 213); + ASSERT_LE(senv.random_read_counter_.Read(), 447); + + // Test readahead shouldn't break mmap mode (where it should be + // disabled). + options.allow_mmap_reads = true; + Reopen(&options); + dbi = static_cast<DBImpl*>(db_); + ASSERT_OK(dbi->VerifyChecksum(ro)); + + CloseDb(); +} + +TEST_F(CorruptionTest, TableFileIndexData) { + Options options; + // very big, we'll trigger flushes manually + options.write_buffer_size = 100 * 1024 * 1024; + Reopen(&options); + // build 2 tables, flush at 5000 + Build(10000, 5000); + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + // corrupt an index block of an entire file + Corrupt(kTableFile, -2000, 500); + options.paranoid_checks = false; + Reopen(&options); + dbi = static_cast_with_check<DBImpl>(db_); + // one full file may be readable, since only one was corrupted + // the other file should be fully non-readable, since index was corrupted + Check(0, 5000); + ASSERT_NOK(dbi->VerifyChecksum()); + + // In paranoid mode, the db cannot be opened due to the corrupted file. + ASSERT_TRUE(TryReopen().IsCorruption()); +} + +TEST_F(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST_F(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write.
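+ // (For example, if the recovered last sequence number were too small, the + // Put of "v6" below would get a sequence number at or below that of "v5" + // and the following Get would still return "v5".)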
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST_F(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST_F(CorruptionTest, CompactionInputError) { + Options options; + options.env = env_; + Reopen(&options); + Build(10); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level2")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + ASSERT_NOK(dbi->VerifyChecksum()); +} + +TEST_F(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.env = env_; + options.paranoid_checks = true; + options.write_buffer_size = 131072; + options.max_write_buffer_number = 2; + Reopen(&options); + DBImpl* dbi = static_cast_with_check(db_); + + // Fill levels >= 1 + for (int level = 1; level < dbi->NumberLevels(); level++) { + ASSERT_OK(dbi->Put(WriteOptions(), "", "begin")); + ASSERT_OK(dbi->Put(WriteOptions(), "~", "end")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + for (int comp_level = 0; comp_level < dbi->NumberLevels() - level; + ++comp_level) { + ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr)); + } + } + + Reopen(&options); + + dbi = static_cast_with_check(db_); + Build(10); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForCompact()); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); + + CorruptTableFileAtLevel(0, 100, 1); + Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + bool failed = false; + for (int i = 0; i < 10000; i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + if (!s.ok()) { + failed = true; + } + // if one write failed, every subsequent write must fail, too + ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db"; + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST_F(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Corrupt(kTableFile, 100, 1); + ASSERT_NOK(dbi->VerifyChecksum()); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +TEST_F(CorruptionTest, RangeDeletionCorrupted) { + ASSERT_OK( + db_->DeleteRange(WriteOptions(), 
db_->DefaultColumnFamily(), "a", "b")); + ASSERT_OK(db_->Flush(FlushOptions())); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(static_cast(1), metadata.size()); + std::string filename = dbname_ + metadata[0].name; + + FileOptions file_opts; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts, + &file_reader, nullptr)); + + uint64_t file_size; + ASSERT_OK( + fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); + + BlockHandle range_del_handle; + ASSERT_OK(FindMetaBlockInFile( + file_reader.get(), file_size, kBlockBasedTableMagicNumber, + ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); + + ASSERT_OK(TryReopen()); + ASSERT_OK(test::CorruptFile(env_, filename, + static_cast(range_del_handle.offset()), 1)); + ASSERT_TRUE(TryReopen().IsCorruption()); +} + +TEST_F(CorruptionTest, FileSystemStateCorrupted) { + for (int iter = 0; iter < 2; ++iter) { + Options options; + options.env = env_; + options.paranoid_checks = true; + options.create_if_missing = true; + Reopen(&options); + Build(10); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + std::vector metadata; + dbi->GetLiveFilesMetaData(&metadata); + ASSERT_GT(metadata.size(), 0); + std::string filename = dbname_ + metadata[0].name; + + delete db_; + db_ = nullptr; + + if (iter == 0) { // corrupt file size + std::unique_ptr file; + ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions())); + ASSERT_OK(file->Append(Slice("corrupted sst"))); + file.reset(); + Status x = TryReopen(&options); + ASSERT_TRUE(x.IsCorruption()); + } else { // delete the file + ASSERT_OK(env_->DeleteFile(filename)); + Status x = TryReopen(&options); + ASSERT_TRUE(x.IsCorruption()); + } + + ASSERT_OK(DestroyDB(dbname_, options_)); + } +} + +static const auto& corruption_modes = { + mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey, + mock::MockTableFactory::kCorruptValue, + mock::MockTableFactory::kCorruptReorderKey}; + +TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + ASSERT_OK(s); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + mock->SetCorruptionMode(mode); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10); + s = db_->Flush(FlushOptions()); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { + Options options; + options.env = env_; + options.paranoid_file_checks = true; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + ASSERT_OK(s); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + // ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + 
+ mock->SetCorruptionMode(mode); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string start, end; + assert(db_ != nullptr); // suppress false clang-analyze report + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + Build(10); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10, 0, 0); + std::string start, end; + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(5, &start), Key(15, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(12, &start), Key(17, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(4, &end))); + Build(10, 10, 0); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + std::string start, end; + Build(10); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3,
&start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(6, &start), Key(8, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.allow_data_in_errors = true; + auto mode = mock::MockTableFactory::kCorruptKey; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + + std::shared_ptr<mock::MockTableFactory> mock = + std::make_shared<mock::MockTableFactory>(); + mock->SetCorruptionMode(mode); + options.table_factory = mock; + + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + Status s = + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(CorruptionTest, CompactionKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + std::shared_ptr<mock::MockTableFactory> mock = + std::make_shared<mock::MockTableFactory>(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); + Build(100, 2); + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone); + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_NOK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); +} + +TEST_F(CorruptionTest, FlushKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + + int cnt = 0; + // Generate some out of order keys from the memtable + SyncPoint::GetInstance()->SetCallBack( + "MemTableIterator::Next:0", [&](void* arg) { + MemTableRep::Iterator* mem_iter = + static_cast<MemTableRep::Iterator*>(arg); + if (++cnt == 3) { + mem_iter->Prev(); + mem_iter->Prev(); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable(); + ASSERT_NOK(s); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, DisableKeyOrderCheck) { + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); + DBImpl* dbi = static_cast_with_check<DBImpl>(db_); + + SyncPoint::GetInstance()->SetCallBack( + "OutputValidator::Add:order_check", + [&](void* /*arg*/) { ASSERT_TRUE(false); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK( + dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, VerifyWholeTableChecksum) { + CloseDb(); + Options options; + options.env = env_; + ASSERT_OK(DestroyDB(dbname_, options)); + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(&options); + + Build(10, 5); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + CloseDb(); + + // Corrupt the first byte of each table file, this must be data block. + Corrupt(kTableFile, 0, 1); + + ASSERT_OK(TryReopen(&options)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + auto* s = reinterpret_cast<Status*>(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); +} + +class CrashDuringRecoveryWithCorruptionTest + : public CorruptionTest, + public testing::WithParamInterface<std::tuple<bool, bool>> { + public: + explicit CrashDuringRecoveryWithCorruptionTest() + : CorruptionTest(), + avoid_flush_during_recovery_(std::get<0>(GetParam())), + track_and_verify_wals_in_manifest_(std::get<1>(GetParam())) {} + + protected: + const bool avoid_flush_during_recovery_; + const bool track_and_verify_wals_in_manifest_; +}; + +INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, false), + std::make_tuple(true, true), + std::make_tuple(false, true))); + +// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB +// won't flush the data from WAL to L0 for all column families if possible. As a +// result, not all column families can increase their log_numbers, and +// min_log_number_to_keep won't change. +// It may prematurely persist a new MANIFEST even before we can declare the DB +// is in a consistent state after recovery (this is when the new WAL is synced) +// and advances log_numbers for some column families.
+// +// If there is a power failure before we sync the new WAL, we will end up in +// a situation in which after persisting the MANIFEST, RocksDB will see some +// column families' log_numbers larger than the corrupted WAL, and the +// "Column family inconsistency: SST file contains data beyond the point of +// corruption" error will be hit, causing recovery to fail. +// +// After adding the fix, only after the new WAL is synced does RocksDB persist a +// new MANIFEST with column families, to ensure RocksDB is in a consistent +// state. RocksDB writes an empty WriteBatch as a sentinel to the new WAL which +// is synced immediately afterwards. The sequence number of the sentinel +// WriteBatch will be the next sequence number immediately after the largest +// sequence number recovered from previous WALs and MANIFEST, because of which +// the DB will be in a consistent state. +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If a future recovery starts from the old MANIFEST, it means writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. +// +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { + CloseDb(); + Options options; + options.track_and_verify_wals_in_manifest = + track_and_verify_wals_in_manifest_; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.avoid_flush_during_recovery = false; + options.env = env_; + ASSERT_OK(DestroyDB(dbname_, options)); + options.create_if_missing = true; + options.max_write_buffer_number = 8; + + Reopen(&options); + Status s; + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + s = db_->CreateColumnFamily(options, test_cf_name, &cfh); + ASSERT_OK(s); + delete cfh; + CloseDb(); + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, options); + std::vector<ColumnFamilyHandle*> handles; + + // 1. Open and populate the DB. Write and flush default_cf several times to + // advance the wal number so that some column families have advanced + // log_number while others don't. + { + ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); + auto* dbimpl = static_cast_with_check<DBImpl>(db_); + assert(dbimpl); + + // Write one key to test_cf. + ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare")); + ASSERT_OK(db_->Flush(FlushOptions(), handles[1])); + + // Write to default_cf and flush this cf several times to advance wal + // number. TEST_SwitchMemtable makes sure WALs are not synced so the test + // can corrupt an un-synced WAL. + for (int i = 0; i < 2; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), + "value" + std::to_string(i))); + ASSERT_OK(dbimpl->TEST_SwitchMemtable()); + } + + for (auto* h : handles) { + delete h; + } + handles.clear(); + CloseDb(); + } + + // 2. Corrupt the second-to-last un-synced wal file to emulate a power reset + // which caused the DB to lose the un-synced WAL.
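+ // (Sketch of the expected WAL state here: the TEST_SwitchMemtable() calls + // above leave several un-synced WALs plus the current one; truncating the + // second-newest models losing the tail of an un-synced WAL on power loss.)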
+ { + std::vector<uint64_t> file_nums; + GetSortedWalFiles(file_nums); + size_t size = file_nums.size(); + assert(size >= 2); + uint64_t log_num = file_nums[size - 2]; + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/8); + } + + // 3. After the first crash, reopen the DB, which contains the corrupted WAL. + // The default family has a higher log number than the corrupted wal number. + // + // Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data + // from WAL to L0 for all column families (test_cf_name in this case). As a + // result, not all column families can increase their log_numbers, and + // min_log_number_to_keep won't change. + // + // Case2: If avoid_flush_during_recovery = false, all column families have + // flushed their data from WAL to L0 during recovery, and none of them will + // ever need to read the WALs again. + + // 4. Fault is injected to fail the recovery. + { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) { + auto* tmp_s = reinterpret_cast<Status*>(arg); + assert(tmp_s); + *tmp_s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + handles.clear(); + options.avoid_flush_during_recovery = true; + s = DB::Open(options, dbname_, cf_descs, &handles, &db_); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ("IO error: Injected", s.ToString()); + for (auto* h : handles) { + delete h; + } + CloseDb(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + + // 5. After the second crash, reopen the db with the second corruption. The + // default family has a higher log number than the corrupted wal number. + // + // Case1: If avoid_flush_during_recovery = true, we persist a new + // MANIFEST with advanced log_numbers for some column families only after + // syncing the WAL. So during the second crash, RocksDB will skip the + // corrupted WAL files as they have been moved to a different folder. Since + // the newly synced WAL file's sequence number (sentinel WriteBatch) will be + // the next sequence number immediately after the largest sequence number + // recovered from previous WALs and MANIFEST, the db will be in a consistent + // state and opens successfully. + // + // Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below + // this number. So during a second crash after persisting the new MANIFEST, + // RocksDB will skip the corrupted WAL(s) because they are all below this + // bound. Therefore, we won't hit the "column family inconsistency" error + // message. + { + options.avoid_flush_during_recovery = avoid_flush_during_recovery_; + ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); + + // Verify that data is not lost. + { + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v)); + ASSERT_EQ("dontcare", v); + + v.clear(); + ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v)); + ASSERT_EQ("value" + std::to_string(0), v); + + // Since the second-to-last wal is corrupted, the key below is not found. + v.clear(); + ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v), + Status::NotFound()); + } + + for (auto* h : handles) { + delete h; + } + handles.clear(); + CloseDb(); + } +} + +// In case of TransactionDB, it enables two-phase-commit. The prepare section of +// an uncommitted transaction always needs to be kept.
Even if we perform flush +// during recovery, we may still need to hold an old WAL. The +// min_log_number_to_keep won't change, and the "Column family inconsistency: +// SST file contains data beyond the point of corruption" error will be hit, +// causing recovery to fail. +// +// After adding the fix, only after the new WAL is synced does RocksDB persist a +// new MANIFEST with column families, to ensure RocksDB is in a consistent +// state. RocksDB writes an empty WriteBatch as a sentinel to the new WAL which +// is synced immediately afterwards. The sequence number of the sentinel +// WriteBatch will be the next sequence number immediately after the largest +// sequence number recovered from previous WALs and MANIFEST, because of which +// the DB will be in a consistent state. +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If a future recovery starts from the old MANIFEST, it means writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. +// +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { + CloseDb(); + Options options; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.track_and_verify_wals_in_manifest = + track_and_verify_wals_in_manifest_; + options.avoid_flush_during_recovery = false; + options.env = env_; + ASSERT_OK(DestroyDB(dbname_, options)); + options.create_if_missing = true; + options.max_write_buffer_number = 3; + Reopen(&options); + + // Create cf test_cf_name. + ColumnFamilyHandle* cfh = nullptr; + const std::string test_cf_name = "test_cf"; + Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh); + ASSERT_OK(s); + delete cfh; + CloseDb(); + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, options); + std::vector<ColumnFamilyHandle*> handles; + + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + + // 1. Open and populate the DB. Write and flush default_cf several times to + // advance the wal number so that some column families have advanced + // log_number while others don't. + { + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, + &handles, &txn_db)); + + auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions()); + // Put cf1 + ASSERT_OK(txn->Put(handles[1], "foo", "value")); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn_db->Flush(FlushOptions())); + + delete txn; + txn = nullptr; + + auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB()); + assert(dbimpl); + + // Put and flush cf0 + for (int i = 0; i < 2; ++i) { + ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i), + "value" + std::to_string(i))); + ASSERT_OK(dbimpl->TEST_SwitchMemtable()); + } + + // Put cf1 + txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->Put(handles[1], "foo1", "value1")); + ASSERT_OK(txn->Commit()); + + delete txn; + txn = nullptr; + + for (auto* h : handles) { + delete h; + } + handles.clear(); + delete txn_db; + } + + // 2. Corrupt the second-to-last wal to emulate a power reset which caused + // the DB to lose the un-synced WAL.
+ { + std::vector<uint64_t> file_nums; + GetSortedWalFiles(file_nums); + size_t size = file_nums.size(); + assert(size >= 2); + uint64_t log_num = file_nums[size - 2]; + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/8); + } + + // 3. After the first crash, reopen the DB, which contains the corrupted WAL. + // The default family has a higher log number than the corrupted wal number. + // There may be old WAL files that it must not delete because they can + // contain data of uncommitted transactions. As a result, + // min_log_number_to_keep won't change. + + { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Open::BeforeSyncWAL", [&](void* arg) { + auto* tmp_s = reinterpret_cast<Status*>(arg); + assert(tmp_s); + *tmp_s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + handles.clear(); + s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles, + &txn_db); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ("IO error: Injected", s.ToString()); + for (auto* h : handles) { + delete h; + } + CloseDb(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + + // 4. Corrupt max_wal_num. + { + std::vector<uint64_t> file_nums; + GetSortedWalFiles(file_nums); + size_t size = file_nums.size(); + uint64_t log_num = file_nums[size - 1]; + CorruptFileWithTruncation(FileType::kWalFile, log_num); + } + + // 5. After the second crash, reopen the db with the second corruption. The + // default family has a higher log number than the corrupted wal number. + // We persist a new MANIFEST with advanced log_numbers for some column + // families only after syncing the WAL. So during the second crash, RocksDB + // will skip the corrupted WAL files as they have been moved to a different + // folder. Since the newly synced WAL file's sequence number (sentinel + // WriteBatch) will be the next sequence number immediately after the largest + // sequence number recovered from previous WALs and MANIFEST, the db will be + // in a consistent state and opens successfully. + { + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, + &handles, &txn_db)); + + // Verify that data is not lost. + { + std::string v; + // Key not visible since it's not committed. + ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v), + Status::NotFound()); + + v.clear(); + ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v)); + ASSERT_EQ("value" + std::to_string(0), v); + + // The last WAL is corrupted, and it contains the two keys below. + v.clear(); + ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v), + Status::NotFound()); + v.clear(); + ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v), + Status::NotFound()); + } + + for (auto* h : handles) { + delete h; + } + delete txn_db; + } +} + +// This test is similar to +// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls +// flush and corrupts the last WAL. It calls flush to sync some of the WALs; +// the remaining ones are unsynced, one of which is then corrupted to simulate +// a crash. +// +// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB +// won't flush the data from WAL to L0 for all column families if possible. As a +// result, not all column families can increase their log_numbers, and +// min_log_number_to_keep won't change.
+// It may prematurely persist a new MANIFEST even before we can declare the DB +// is in a consistent state after recovery (this is when the new WAL is synced) +// and advances log_numbers for some column families. +// +// If there is a power failure before we sync the new WAL, we will end up in +// a situation in which after persisting the MANIFEST, RocksDB will see some +// column families' log_numbers larger than the corrupted WAL, and the +// "Column family inconsistency: SST file contains data beyond the point of +// corruption" error will be hit, causing recovery to fail. +// +// After adding the fix, only after the new WAL is synced does RocksDB persist a +// new MANIFEST with column families, to ensure RocksDB is in a consistent +// state. RocksDB writes an empty WriteBatch as a sentinel to the new WAL which +// is synced immediately afterwards. The sequence number of the sentinel +// WriteBatch will be the next sequence number immediately after the largest +// sequence number recovered from previous WALs and MANIFEST, because of which +// the DB will be in a consistent state. +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If a future recovery starts from the old MANIFEST, it means writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. + +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) { + CloseDb(); + Options options; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.avoid_flush_during_recovery = false; + options.env = env_; + options.create_if_missing = true; + + ASSERT_OK(DestroyDB(dbname_, options)); + Reopen(&options); + + ColumnFamilyHandle* cfh = nullptr; + const std::string test_cf_name = "test_cf"; + Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh); + ASSERT_OK(s); + delete cfh; + + CloseDb(); + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, options); + std::vector<ColumnFamilyHandle*> handles; + + { + ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); + + // Write one key to test_cf. + ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare")); + + // Write to default_cf and flush this cf several times to advance wal + // number. + for (int i = 0; i < 2; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), + "value" + std::to_string(i))); + ASSERT_OK(db_->Flush(FlushOptions())); + } + + ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare")); + for (auto* h : handles) { + delete h; + } + handles.clear(); + CloseDb(); + } + + // Corrupt the last un-synced wal file to emulate a power reset which + // caused the DB to lose the un-synced WAL. + { + std::vector<uint64_t> file_nums; + GetSortedWalFiles(file_nums); + size_t size = file_nums.size(); + uint64_t log_num = file_nums[size - 1]; + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/8); + } + + // Fault is injected to fail the recovery.
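+ // (The IOError injected below makes DB::Open fail partway through + // recovery, emulating a crash during the recovery itself.)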
+  {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    options.avoid_flush_during_recovery = true;
+    s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
+    for (auto* h : handles) {
+      delete h;
+    }
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  // Reopen the DB again.
+  {
+    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+    // Verify that data is not lost.
+    {
+      std::string v;
+      ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+      ASSERT_EQ("dontcare", v);
+
+      for (int i = 0; i < 2; ++i) {
+        v.clear();
+        ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
+        ASSERT_EQ("value" + std::to_string(i), v);
+      }
+
+      // Since the last WAL was corrupted after the flush, the key below is
+      // not found.
+      v.clear();
+      ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
+                Status::NotFound());
+    }
+
+    for (auto* h : handles) {
+      delete h;
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..868b798ea
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
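+//
+// End-to-end tests of DB behavior with the Cuckoo table SST format:
+// flushes, duplicate keys, a custom uint64 comparator, multi-file
+// compactions, and mixing table formats via the adaptive table factory.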
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+ public:
+  CuckooTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~CuckooTableDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewCuckooTableFactory());
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+    options.allow_mmap_reads = true;
+    options.create_if_missing = true;
+    options.allow_concurrent_memtable_write = false;
+    return options;
+  }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  // The following util methods are copied from plain_table_db_test.
+  void Reopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    ASSERT_OK(DB::Open(opts, dbname_, &db_));
+  }
+
+  void DestroyAndReopen(Options* options) {
+    assert(options);
+    ASSERT_OK(db_->Close());
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+    Reopen(options);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+    return atoi(property.c_str());
+  }
+
+  // Return the spread of files per level.
+  std::string FilesPerLevel() {
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+  // Try with an empty DB first.
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+  // Add some values to the DB.
+  Options options = CurrentOptions();
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  TablePropertiesCollection ptc;
+  ASSERT_OK(
+      reinterpret_cast<DBImpl*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+  // Now add more keys and flush.
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key5", "v5"));
+  ASSERT_OK(Put("key6", "v6"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_OK(
+      reinterpret_cast<DBImpl*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(2U, ptc.size());
+  auto row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v5", Get("key5"));
+  ASSERT_EQ("v6", Get("key6"));
+
+  ASSERT_OK(Delete("key6"));
+  ASSERT_OK(Delete("key5"));
+  ASSERT_OK(Delete("key4"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_OK(
+      reinterpret_cast<DBImpl*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(3U, ptc.size());
+  row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+  ASSERT_EQ("NOT_FOUND", Get("key5"));
+  ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+  Options options = CurrentOptions();
+  Reopen(&options);
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key1", "v3"));  // Duplicate
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  TablePropertiesCollection ptc;
+  ASSERT_OK(
+      reinterpret_cast<DBImpl*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_EQ("v3", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+static std::string Uint64Key(uint64_t i) {
+  std::string str;
+  str.resize(8);
+  memcpy(&str[0], static_cast<void*>(&i), 8);
+  return str;
+}
+}  // namespace
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+  Options options = CurrentOptions();
+  options.comparator = test::Uint64Comparator();
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put(Uint64Key(1), "v1"));
+  ASSERT_OK(Put(Uint64Key(2), "v2"));
+  ASSERT_OK(Put(Uint64Key(3), "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("v2", Get(Uint64Key(2)));
+  ASSERT_EQ("v3", Get(Uint64Key(3)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+  // Add more keys.
+  ASSERT_OK(Delete(Uint64Key(2)));  // Delete.
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_OK(Put(Uint64Key(3), "v0"));  // Update.
+  ASSERT_OK(Put(Uint64Key(4), "v4"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+  ASSERT_EQ("v0", Get(Uint64Key(3)));
+  ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+  // Create a big L0 file and check that it compacts into multiple files in
+  // L1.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 270 << 10;
+  // Two SST files should be created, each containing 14 keys.
+  // The number of buckets will be 16. Total size ~156 KB.
+  options.target_file_size_base = 160 << 10;
+  Reopen(&options);
+
+  // Write 28 values, each 10016 B ~ 10 KB.
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("1", FilesPerLevel());
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow trivial move */));
+  ASSERT_EQ("0,2", FilesPerLevel());
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+  }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+  // Insert the same key twice so that the entries go to different SST files.
+  // Then wait for compaction and check that the latest value is stored and
+  // the old value is removed.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(&options);
+
+  // Write 11 values, each 10016 B.
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("1", FilesPerLevel());
+
+  // Generate one more file in level-0, which should trigger level-0
+  // compaction.
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+  ASSERT_EQ("0,1", FilesPerLevel());
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+  }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+
+  // Ensure options are compatible with PlainTable.
+  options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+  // Write some keys using the cuckoo table.
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  // Write some keys using the plain table.
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+  std::shared_ptr<TableFactory> cuckoo_table_factory(NewCuckooTableFactory());
+  options.create_if_missing = false;
+  options.table_factory.reset(
+      NewAdaptiveTableFactory(plain_table_factory, block_based_factory,
+                              plain_table_factory, cuckoo_table_factory));
+  Reopen(&options);
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key1", "v5"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  // Write some keys using the block based table.
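+  // (The adaptive factory writes new files with its first argument but picks
+  // a reader per existing file from the SST footer, so files written above by
+  // the other factories stay readable.)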
+  options.table_factory.reset(
+      NewAdaptiveTableFactory(block_based_factory, block_based_factory,
+                              plain_table_factory, cuckoo_table_factory));
+  Reopen(&options);
+  ASSERT_OK(Put("key5", "v6"));
+  ASSERT_OK(Put("key2", "v7"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_EQ("v5", Get("key1"));
+  ASSERT_EQ("v7", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v6", Get("key5"));
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+    ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+  } else {
+    fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+    return 0;
+  }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..a28ac2b88
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,4643 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "utilities/counted_fs.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+  DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  DB* db2 = nullptr;
+  Status s = DB::Open(options, dbname_, &db2);
+  ASSERT_NOK(s) << [db2]() {
+    delete db2;
+    return "db2 open: ok";
+  }();
+  ASSERT_EQ(Status::Code::kIOError, s.code());
+  ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+  ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+  delete db2;
+}
+
+TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
+  if (!IsDirectIOSupported()) {
+    ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.use_direct_io_for_flush_and_compaction = true;
+  options.writable_file_max_buffer_size = 0;
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+
+  options.writable_file_max_buffer_size = 1024;
+  Reopen(options);
+  const std::unordered_map<std::string, std::string> new_db_opts = {
+      {"writable_file_max_buffer_size", "0"}};
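+  // Dynamically shrinking the write buffer to 0 while direct I/O is enabled
+  // should be rejected as well.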
+  ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, UniqueSession) {
+  Options options = CurrentOptions();
+  std::string sid1, sid2, sid3, sid4;
+
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  Reopen(options);
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(db_->GetDbSessionId(sid4));
+  Reopen(options);
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+
+  ASSERT_NE(sid1, sid2);
+  ASSERT_NE(sid1, sid3);
+  ASSERT_NE(sid2, sid3);
+
+  ASSERT_EQ(sid2, sid4);
+
+  // Expected compact format for session ids (see notes in implementation)
+  TestRegex expected("[0-9A-Z]{20}");
+  EXPECT_MATCHES_REGEX(sid1, expected);
+  EXPECT_MATCHES_REGEX(sid2, expected);
+  EXPECT_MATCHES_REGEX(sid3, expected);
+
+#ifndef ROCKSDB_LITE
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  // Test uniqueness between readonly open (sid1) and regular open (sid3)
+  ASSERT_NE(sid1, sid3);
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+
+  ASSERT_NE(sid1, sid2);
+
+  ASSERT_EQ(sid2, sid3);
+#endif  // ROCKSDB_LITE
+
+  CreateAndReopenWithCF({"goku"}, options);
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  ASSERT_OK(Put("bar", "e1"));
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_EQ("e1", Get("bar"));
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+  ReopenWithColumnFamilies({"default", "goku"}, options);
+  ASSERT_OK(db_->GetDbSessionId(sid4));
+
+  ASSERT_EQ(sid1, sid2);
+  ASSERT_EQ(sid2, sid3);
+
+  ASSERT_NE(sid1, sid4);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  auto verify_one_iter = [&](Iterator* iter) {
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      ++count;
+    }
+    // Always expect two keys: "foo" and "bar"
+    ASSERT_EQ(count, 2);
+  };
+
+  auto verify_all_iters = [&]() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    verify_one_iter(iter);
+    delete iter;
+
+    std::vector<Iterator*> iters;
+    ASSERT_OK(db_->NewIterators(ReadOptions(),
+                                {dbfull()->DefaultColumnFamily()}, &iters));
+    ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+    verify_one_iter(iters[0]);
+    delete iters[0];
+  };
+
+  auto options = CurrentOptions();
+  assert(options.env == env_);
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  verify_all_iters();
+  Close();
+
+  // Reopen and flush memtable.
+  Reopen(options);
+  ASSERT_OK(Flush());
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  verify_all_iters();
+  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+// TODO akanksha: Update the test to check that combination
+// does not actually write to FS (use open read-only with
+// CompositeEnvWrapper+ReadOnlyFileSystem).
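+//
+// A rough sketch of that idea (hypothetical, not wired up here):
+//
+//   std::shared_ptr<FileSystem> ro_fs =
+//       std::make_shared<ReadOnlyFileSystem>(FileSystem::Default());
+//   std::unique_ptr<Env> ro_env(
+//       new CompositeEnvWrapper(Env::Default(), ro_fs));
+//   options.env = ro_env.get();  // any write through this env should fail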
+TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  auto options = CurrentOptions();
+  options.write_dbid_to_manifest = true;
+  assert(options.env == env_);
+  ASSERT_OK(ReadOnlyReopen(options));
+  std::string db_id1;
+  ASSERT_OK(db_->GetDbIdentity(db_id1));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+  Close();
+
+  // Reopen and flush memtable.
+  Reopen(options);
+  ASSERT_OK(Flush());
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+  std::string db_id2;
+  ASSERT_OK(db_->GetDbIdentity(db_id2));
+  ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+  const uint64_t kFileSize = 1 << 20;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = kFileSize;
+  options.target_file_size_base = kFileSize;
+  options.max_bytes_for_level_base = 1 << 30;
+  options.compression = kNoCompression;
+  Reopen(options);
+  // 1 L0 file; use CompactedDB if max_open_files == -1.
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+  ASSERT_OK(Flush());
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  Reopen(options);
+  // Add more L0 files
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+  ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("something_not_flushed", "x"));
+  Close();
+
+  ASSERT_OK(ReadOnlyReopen(options));
+  // Falls back to a read-only DB.
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+
+  // TODO: validate that other write ops return NotImplemented
+  // (DBImplReadOnly is missing some overrides)
+
+  // Ensure no deadlock on flush triggered by another API function.
+  // (The old deadlock bug depends on something_not_flushed above.)
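+  // Both GetLiveFiles(..., /*flush*/ true) and GetLiveFilesStorageInfo may
+  // trigger a flush internally; in read-only/compacted mode they must degrade
+  // gracefully rather than deadlock.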
+  std::vector<std::string> files;
+  uint64_t manifest_file_size;
+  ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+  LiveFilesStorageInfoOptions lfsi_opts;
+  lfsi_opts.wal_size_for_flush = 0;  // always
+  std::vector<LiveFileStorageInfo> files2;
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+  Close();
+
+  // Full compaction
+  Reopen(options);
+  // Add more keys
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+  ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+  ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+  Close();
+
+  // CompactedDB
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ("NOT_FOUND", Get("abc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+  ASSERT_EQ("NOT_FOUND", Get("ccc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+  ASSERT_EQ("NOT_FOUND", Get("ggg"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+  ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+  // TODO: validate that other write ops return NotImplemented
+  // (CompactedDB is missing some overrides)
+
+  // Ensure no deadlock on flush triggered by another API function
+  ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+  // MultiGet
+  std::vector<std::string> values;
+  std::vector<Status> status_list = dbfull()->MultiGet(
+      ReadOptions(),
+      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
+      &values);
+  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+  ASSERT_OK(status_list[0]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+  ASSERT_TRUE(status_list[1].IsNotFound());
+  ASSERT_OK(status_list[2]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+  ASSERT_TRUE(status_list[3].IsNotFound());
+  ASSERT_OK(status_list[4]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+  ASSERT_TRUE(status_list[5].IsNotFound());
+
+  Reopen(options);
+  // Add a key
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
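+  // With num_levels back above the DB's deepest populated level, the reopen
+  // succeeds.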
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the
+    // test case. Skip MergePut because single delete does not get removed
+    // when it encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+  // It is possible to produce empty flushes when using single deletes. Tests
+  // whether empty flushes cause issues.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "a", Slice()));
+    ASSERT_OK(SingleDelete(1, "a"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip FIFO and universal compaction as they do not apply to the test
+    // case. Skip MergePut because merges cannot be combined with single
+    // deletions.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    // Try with both a short key and a long key
+    for (int i = 0; i < 2; i++) {
+      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(1, key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(1, key, "v2")); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} +#endif // ROCKSDB_LITE + +TEST_F(DBBasicTest, CheckLock) { + do { + DB* localdb = nullptr; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(options)); + + // second open should fail + Status s = DB::Open(options, dbname_, &localdb); + ASSERT_NOK(s) << [localdb]() { + delete localdb; + return "localdb open: ok"; + }(); +#ifdef OS_LINUX + ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); +#endif // OS_LINUX + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_size_to_maintain = -1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); + + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); + CreateAndReopenWithCF({"pikachu"}, options); + + // Compaction can still go through even if no thread can flush the + // mem table. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. 
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, Flush) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    SetPerfLevel(kEnableTime);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    // this will now also flush the last 2 writes
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    get_perf_context()->Reset();
+    Get(1, "foo");
+    ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+    ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v2", Get(1, "bar"));
+    get_perf_context()->Reset();
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // 'foo' should be there because its put
+    // has WAL enabled.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+  do {
+    Options options;
+    options.max_manifest_file_size = 10;  // 10 bytes
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    {
+      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush =
+          dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush =
+          dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(),
+                manifest_after_flush);
+      // check if a new manifest file got inserted or not.
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts) {
+  constexpr size_t kMinIdSize = 10;
+  do {
+    for (bool with_manifest : {false, true}) {
+      std::string idfilename = IdentityFileName(dbname_);
+      std::string id1, tmp;
+      ASSERT_OK(db_->GetDbIdentity(id1));
+      ASSERT_GE(id1.size(), kMinIdSize);
+
+      Options options = CurrentOptions();
+      options.write_dbid_to_manifest = with_manifest;
+      Reopen(options);
+      std::string id2;
+      ASSERT_OK(db_->GetDbIdentity(id2));
+      // id2 should match id1 because identity was not regenerated
+      ASSERT_EQ(id1, id2);
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id2);
+
+      // Recover from deleted/missing IDENTITY
+      ASSERT_OK(env_->DeleteFile(idfilename));
+      Reopen(options);
+      std::string id3;
+      ASSERT_OK(db_->GetDbIdentity(id3));
+      if (with_manifest) {
+        // id3 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id3);
+      } else {
+        // id3 should NOT match id1 because identity was regenerated
+        ASSERT_NE(id1, id3);
+        ASSERT_GE(id3.size(), kMinIdSize);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id3);
+
+      // Recover from truncated IDENTITY
+      {
+        std::unique_ptr<WritableFile> w;
+        ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+        ASSERT_OK(w->Close());
+      }
+      Reopen(options);
+      std::string id4;
+      ASSERT_OK(db_->GetDbIdentity(id4));
+      if (with_manifest) {
+        // id4 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id4);
+      } else {
+        // id4 should NOT match id1 because identity was regenerated
+        ASSERT_NE(id1, id4);
+        ASSERT_GE(id4.size(), kMinIdSize);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id4);
+
+      // Recover from overwritten IDENTITY
+      std::string silly_id = "asdf123456789";
+      {
+        std::unique_ptr<WritableFile> w;
+        ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+        ASSERT_OK(w->Append(silly_id));
+        ASSERT_OK(w->Close());
+      }
+      Reopen(options);
+      std::string id5;
+      ASSERT_OK(db_->GetDbIdentity(id5));
+      if (with_manifest) {
+        // id5 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id5);
+      } else {
+        ASSERT_EQ(id5, silly_id);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id5);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+  env_->SetMockSleep();
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(0, "foo", "0v1"));
+    ASSERT_OK(Put(1, "foo", "1v1"));
+
+    const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_OK(Put(0, "foo", "0v2"));
+    ASSERT_OK(Put(1, "foo", "1v2"));
+
+    env_->MockSleepForSeconds(1);
+
+    const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_OK(Put(0, "foo", "0v3"));
+    ASSERT_OK(Put(1, "foo", "1v3"));
+
+    {
+      ManagedSnapshot s3(db_);
+      ASSERT_EQ(3U, GetNumSnapshots());
+      ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+      ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+      ASSERT_OK(Put(0, "foo", "0v4"));
+      ASSERT_OK(Put(1, "foo", "1v4"));
+      ASSERT_EQ("0v1", Get(0, "foo", s1));
+      ASSERT_EQ("1v1", Get(1, "foo", s1));
+      ASSERT_EQ("0v2", Get(0, "foo", s2));
+      ASSERT_EQ("1v2", Get(1, "foo", s2));
+      ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+      ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+      ASSERT_EQ("0v4", Get(0, "foo"));
+      ASSERT_EQ("1v4", Get(1, "foo"));
+    }
+
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+    ASSERT_EQ(1U, GetNumSnapshots());
+    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ(0U, GetNumSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+#endif  // ROCKSDB_LITE
+
+class DBBasicMultiConfigs : public DBBasicTest,
+                            public ::testing::WithParamInterface<int> {
+ public:
+  DBBasicMultiConfigs() { option_config_ = GetParam(); }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> option_configs;
+    for (int option_config = kDefault; option_config < kEnd;
+         ++option_config) {
+      if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) {
+        option_configs.push_back(option_config);
+      }
+    }
+    return option_configs;
+  }
+};
+
+TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  FillLevels("a", "z", 1);
+
+  ASSERT_OK(Put(1, "foo", "first"));
+  const Snapshot* snapshot1 = db_->GetSnapshot();
+  ASSERT_OK(Put(1, "foo", "second"));
+  ASSERT_OK(Put(1, "foo", "third"));
+  ASSERT_OK(Put(1, "foo", "fourth"));
+  const Snapshot* snapshot2 = db_->GetSnapshot();
+  ASSERT_OK(Put(1, "foo", "fifth"));
+  ASSERT_OK(Put(1, "foo", "sixth"));
+
+  // All entries (including duplicates) exist
+  // before any compaction or flush is triggered.
+  ASSERT_EQ(AllEntriesFor("foo", 1),
+            "[ sixth, fifth, fourth, third, second, first ]");
+  ASSERT_EQ("sixth", Get(1, "foo"));
+  ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+  ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+  // After a flush, "second", "third" and "fifth" should
+  // be removed
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+  // after we release the snapshot1, only two values left
+  db_->ReleaseSnapshot(snapshot1);
+  FillLevels("a", "z", 1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                   nullptr, nullptr));
+
+  // We have only one valid snapshot, snapshot2. Since snapshot1 is
+  // no longer valid, "first" should be removed by a compaction.
+ ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); +} + +INSTANTIATE_TEST_CASE_P( + DBBasicMultiConfigs, DBBasicMultiConfigs, + ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs())); + +TEST_F(DBBasicTest, DBOpen_Options) { + Options options = CurrentOptions(); + Close(); + Destroy(options); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname_, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + options.create_if_missing = true; + s = DB::Open(options, dbname_, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname_, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname_, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST_F(DBBasicTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); + + // Write two new keys + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + + // Case1: Delete followed by a put + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); + + // Case 2: Delete followed by another delete + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); + + // Case 3: Put followed by a delete + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); + + // Case 4: Put followed by another Put + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_OK(Put(1, "foo", "v5")); + 
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+    // clear database
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    ASSERT_OK(Put(1, "foo", "v6"));
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(Put(1, "foo", "v7"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // clear database
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 6: snapshot followed by a put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    ASSERT_OK(Put(1, "foo", "v8"));
+    ASSERT_OK(Put(1, "foo", "v9"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia",
+                         "nikitich", "alyosha", "popovich"},
+                        options);
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "pikachu", "pikachu"));
+  ASSERT_OK(Put(2, "ilya", "ilya"));
+  ASSERT_OK(Put(3, "muromec", "muromec"));
+  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+  ASSERT_OK(Put(5, "nikitich", "nikitich"));
+  ASSERT_OK(Put(6, "alyosha", "alyosha"));
+  ASSERT_OK(Put(7, "popovich", "popovich"));
+
+  for (int i = 0; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), i + 1U);
+  }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+    get_perf_context()->Reset();
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Empty Key Set
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(static_cast<int>(s.size()), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+class DBBlockChecksumTest : public DBBasicTest,
+                            public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
+                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
+  BlockBasedTableOptions table_options;
+  table_options.format_version = GetParam();
+  Options options = CurrentOptions();
+  const int kNumPerFile = 2;
+
+  const auto algs = GetSupportedChecksums();
+  const int algs_size = static_cast<int>(algs.size());
+
+  // generate one table with each type of checksum
+  for (int i = 0; i < algs_size; ++i) {
+    table_options.checksum = algs[i];
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // with each valid checksum type setting...
+  for (int i = 0; i < algs_size; ++i) {
+    table_options.checksum = algs[i];
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    // verify every type of checksum (should be regardless of that setting)
+    for (int j = 0; j < algs_size * kNumPerFile; ++j) {
+      ASSERT_EQ(Key(j), Get(Key(j)));
+    }
+  }
+
+  // Now test an invalid checksum type.
+  table_options.checksum = static_cast<ChecksumType>(123);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+}
+
+// On Windows you can have either a memory mapped file or a file
+// with unbuffered access.
+// So this asserts and it does not make sense to run this test there.
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+  if (!IsMemoryMappedAccessSupported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+
+  options.use_direct_reads = true;
+  options.allow_mmap_reads = true;
+  ASSERT_NOK(TryReopen(options));
+
+  // All other combinations are acceptable
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+
+  if (IsDirectIOSupported()) {
+    options.use_direct_reads = true;
+    options.allow_mmap_reads = false;
+    ASSERT_OK(TryReopen(options));
+  }
+
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+}
+#endif
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+  static const char* kClassName() { return "TestEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        CloseHelper().PermitUncheckedError();
+      }
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }
+
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      return Status::IOError();
+    }
+    TestEnv* env;
+  };
+
+  void CloseCountInc() { close_count++; }
+
+  int GetCloseCount() { return close_count; }
+
+  Status NewLogger(const std::string& /*fname*/,
+                   std::shared_ptr<Logger>* result) override {
+    result->reset(new TestLogger(this));
+    return Status::OK();
+  }
+
+ private:
+  int close_count;
+};
+
+TEST_F(DBBasicTest, DBClose) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("db_close_test");
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  DB* db = nullptr;
+  TestEnv* env = new TestEnv(env_);
+  std::unique_ptr<TestEnv> local_env_guard(env);
+  options.create_if_missing = true;
+  options.env = env;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  ASSERT_EQ(s, Status::IOError());
+
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 1);
+
+  // Do not call DB::Close() and ensure our logger Close() still gets called
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+
+  // Provide our own logger and ensure DB::Close() does not close it
+  options.info_log.reset(new TestEnv::TestLogger(env));
+  options.create_if_missing = false;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(s, Status::OK());
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+  options.info_log.reset();
+  ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test");
+  // Configure a specific WAL directory
+  options.wal_dir = dbname + "_wal_dir";
+  // Configure 3 different data directories
+  options.db_paths.emplace_back(dbname + "_1", 512 * 1024);
+  options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024);
+
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  DB* db = nullptr;
+  std::unique_ptr<Env> env = NewCompositeEnv(
+      std::make_shared<CountedFileSystem>(FileSystem::Default()));
+  options.create_if_missing = true;
+  options.env = env.get();
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  // Explicitly close the database to ensure the open and close counters for
+  // directories are equivalent
+  s = db->Close();
+  auto* counted_fs =
+      options.env->GetFileSystem()->CheckedCast<CountedFileSystem>();
+  ASSERT_TRUE(counted_fs != nullptr);
+  ASSERT_EQ(counted_fs->counters()->dir_opens,
+            counted_fs->counters()->dir_closes);
+  ASSERT_OK(s);
+  delete db;
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.manual_wal_flush = true;
+  options.write_buffer_size = 100;
+  options.env = fault_injection_env.get();
+
+  Reopen(options);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  ASSERT_OK(Put("key3", "value3"));
+  fault_injection_env->SetFilesystemActive(false);
+  Status s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  // Retrying should return the same error.
+  s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  fault_injection_env->SetFilesystemActive(true);
+  // Retrying Close() is a no-op even after the system is back. Could be
+  // improved if Close() were retry-able: #9029
+  s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  Destroy(options);
+}
+
+class DBMultiGetTestWithParam
+    : public DBBasicTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia",
+                         "nikitich", "alyosha", "popovich"},
+                        options);
+  // <CF, key, value> tuples
+  std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+  static const int num_keys = 24;
+  cf_kv_vec.reserve(num_keys);
+
+  for (int i = 0; i < num_keys; ++i) {
+    int cf = i / 3;
+    int cf_key = 1 % 3;
+    cf_kv_vec.emplace_back(std::make_tuple(
+        cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+        "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+    ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+                  std::get<2>(cf_kv_vec[i])));
+  }
+
+  int get_sv_count = 0;
+  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (++get_sv_count == 2) {
+          // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+          // is forced to repeat the process
+          for (int i = 0; i < num_keys; ++i) {
+            int cf = i / 3;
+            int cf_key = i % 8;
+            if (cf_key == 0) {
+              ASSERT_OK(Flush(cf));
+            }
+            ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]),
+                          std::get<1>(cf_kv_vec[i]),
+                          std::get<2>(cf_kv_vec[i]) + "_2"));
+          }
+        }
+        if (get_sv_count == 11) {
+          for (int i = 0; i < 8; ++i) {
+            auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                            db->GetColumnFamilyHandle(i))
+                            ->cfd();
+            ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < num_keys; ++i) {
+    cfs.push_back(std::get<0>(cf_kv_vec[i]));
+    keys.push_back(std::get<1>(cf_kv_vec[i]));
+  }
+
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values.size(), num_keys);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+  }
+
+  keys.clear();
+  cfs.clear();
+  cfs.push_back(std::get<0>(cf_kv_vec[0]));
+  keys.push_back(std::get<1>(cf_kv_vec[0]));
+  cfs.push_back(std::get<0>(cf_kv_vec[3]));
+  keys.push_back(std::get<1>(cf_kv_vec[3]));
+  cfs.push_back(std::get<0>(cf_kv_vec[4]));
+  keys.push_back(std::get<1>(cf_kv_vec[4]));
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+  ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+  ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+  keys.clear();
+  cfs.clear();
+  cfs.push_back(std::get<0>(cf_kv_vec[7]));
+  keys.push_back(std::get<1>(cf_kv_vec[7]));
+  cfs.push_back(std::get<0>(cf_kv_vec[6]));
+  keys.push_back(std::get<1>(cf_kv_vec[6]));
+  cfs.push_back(std::get<0>(cf_kv_vec[1]));
+  keys.push_back(std::get<1>(cf_kv_vec[1]));
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+  ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+  ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+  for (int cf = 0; cf < 8; ++cf) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
+            ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia",
+                         "nikitich", "alyosha", "popovich"},
+                        options);
+
+  for (int i = 0; i < 8; ++i) {
+    ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+                  "cf" + std::to_string(i) + "_val"));
+  }
+
+  int get_sv_count = 0;
+  int retries = 0;
+  bool last_try = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+        last_try = true;
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (last_try) {
+          return;
+        }
+        if (++get_sv_count == 2) {
+          ++retries;
+          get_sv_count = 0;
+          for (int i = 0; i < 8; ++i) {
+            ASSERT_OK(Flush(i));
+            ASSERT_OK(Put(
+                i, "cf" + std::to_string(i) + "_key",
+                "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < 8; ++i) {
+    cfs.push_back(i);
+    keys.push_back("cf" + std::to_string(i) + "_key");
+  }
+
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_TRUE(last_try);
+  ASSERT_EQ(values.size(), 8);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j],
+              "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+  }
+  for (int i = 0; i < 8; ++i) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+            ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
(std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (++get_sv_count == 2) { + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val2")); + } + } + if (get_sv_count == 8) { + for (int i = 0; i < 8; ++i) { + auto* cfd = static_cast_with_check( + db->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_TRUE( + (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) || + (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete)); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector cfs; + std::vector keys; + std::vector values; + + for (int i = 0; i < 8; ++i) { + cfs.push_back(i); + keys.push_back("cf" + std::to_string(i) + "_key"); + } + + const Snapshot* snapshot = db_->GetSnapshot(); + values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()), + std::get<1>(GetParam())); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(values.size(), 8); + for (unsigned int j = 0; j < values.size(); ++j) { + ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); + } + for (int i = 0; i < 8; ++i) { + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + } +} + +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + Options options = CurrentOptions(); + CreateAndReopenWithCF({"one", "two"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(2, "baz", "xyz")); + ASSERT_OK(Put(1, "abc", "def")); + + // Note: keys for the same CF do not form a consecutive range + std::vector cfs{1, 2, 1}; + std::vector keys{"foo", "baz", "abc"}; + std::vector values; + + values = MultiGet(cfs, keys, /* snapshot */ nullptr, + /* batched */ std::get<0>(GetParam()), + /* async */ std::get<1>(GetParam())); + + ASSERT_EQ(values.size(), 3); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "xyz"); + ASSERT_EQ(values[2], "def"); +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k5", "v5")); + ASSERT_OK(Delete(1, "no_key")); + + get_perf_context()->Reset(); + + std::vector keys({"no_key", "k5", "k4", "k3", "k2", "k1"}); + std::vector values(keys.size()); + 
std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_TRUE(s[2].IsNotFound()); + ASSERT_OK(s[3]); + ASSERT_OK(s[4]); + ASSERT_OK(s[5]); + + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k5", "v5")); + ASSERT_OK(Delete(1, "no_key")); + + get_perf_context()->Reset(); + + std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), true); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_OK(s[0]); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_TRUE(s[3].IsNotFound()); + ASSERT_OK(s[4]); + ASSERT_TRUE(s[5].IsNotFound()); + + SetPerfLevel(kDisable); + } while (ChangeOptions()); +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateAndReopenWithCF({"pikachu"}, opts); + SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable + ASSERT_OK(Merge(1, "k1", "v1")); + ASSERT_OK(Merge(1, "k2", "v2")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k3", "v3")); + ASSERT_OK(Merge(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 
1); + ASSERT_OK(Merge(1, "k4", "v4_2")); + ASSERT_OK(Merge(1, "k6", "v6")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k7", "v7")); + ASSERT_OK(Merge(1, "k8", "v8")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + get_perf_context()->Reset(); + + std::vector keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); + ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3"); + ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes); + + for (Status& status : s) { + ASSERT_OK(status); + } + + SetPerfLevel(kDisable); +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 128; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 128; i += 3) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 128; i += 5) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 128; i += 9) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys; + std::vector values; + + for (int i = 64; i < 80; ++i) { + keys.push_back("key_" + std::to_string(i)); + } + + values = MultiGet(keys, nullptr, std::get<1>(GetParam())); + ASSERT_EQ(values.size(), 16); + for (unsigned int j = 0; j < values.size(); ++j) { + int key = j + 64; + if (key % 9 == 0) { + ASSERT_EQ(values[j], "val_mem_" + std::to_string(key)); + } else if (key % 5 == 0) { + ASSERT_EQ(values[j], "val_l0_" + std::to_string(key)); + } else if (key % 3 == 0) { + ASSERT_EQ(values[j], "val_l1_" + std::to_string(key)); + } else { + ASSERT_EQ(values[j], "val_l2_" + std::to_string(key)); + } + } +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires 
coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 128; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 128; i += 3) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 128; i += 5) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 128; i += 9) { + ASSERT_OK( + Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys; + std::vector values; + + for (int i = 32; i < 80; ++i) { + keys.push_back("key_" + std::to_string(i)); + } + + values = MultiGet(keys, nullptr, std::get<1>(GetParam())); + ASSERT_EQ(values.size(), keys.size()); + for (unsigned int j = 0; j < 48; ++j) { + int key = j + 32; + std::string value; + value.append("val_l2_" + std::to_string(key)); + if (key % 3 == 0) { + value.append(","); + value.append("val_l1_" + std::to_string(key)); + } + if (key % 5 == 0) { + value.append(","); + value.append("val_l0_" + std::to_string(key)); + } + if (key % 9 == 0) { + value.append(","); + value.append("val_mem_" + std::to_string(key)); + } + ASSERT_EQ(values[j], value); + } +} + +TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_SKIP("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v_1")); + ASSERT_OK(Put(1, "k2", "v_2")); + ASSERT_OK(Put(1, "k3", "v_3")); + ASSERT_OK(Put(1, "k4", "v_4")); + ASSERT_OK(Put(1, "k5", "v_5")); + ASSERT_OK(Put(1, "k6", "v_6")); + std::vector keys = {"k1", "k2", "k3", "k4", "k5", "k6"}; + std::vector values(keys.size()); + std::vector s(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + + get_perf_context()->Reset(); + ReadOptions ro; + ro.value_size_soft_limit = 11; + ro.async_io = std::get<1>(GetParam()); + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + for (unsigned int i = 0; i < 4; i++) { + ASSERT_EQ(std::string(values[i].data(), values[i].size()), + "v_" + std::to_string(i + 1)); + } + + for (unsigned int i = 4; i < 6; i++) { + ASSERT_TRUE(s[i].IsAborted()); + } 
+
+  ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes);
+  SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    return;
+  }
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+
+    ASSERT_OK(Put(1, "k6", "v6"));
+    ASSERT_OK(Put(1, "k7", "v7_"));
+    ASSERT_OK(Put(1, "k3", "v3_"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k11", "v11"));
+    ASSERT_OK(Delete(1, "no_key"));
+    ASSERT_OK(Put(1, "k8", "v8_"));
+    ASSERT_OK(Put(1, "k13", "v13"));
+    ASSERT_OK(Put(1, "k14", "v14"));
+    ASSERT_OK(Put(1, "k15", "v15"));
+    ASSERT_OK(Put(1, "k16", "v16"));
+    ASSERT_OK(Put(1, "k17", "v17"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(Put(1, "k1", "v1_"));
+    ASSERT_OK(Put(1, "k2", "v2_"));
+    ASSERT_OK(Put(1, "k5", "v5_"));
+    ASSERT_OK(Put(1, "k9", "v9_"));
+    ASSERT_OK(Put(1, "k10", "v10"));
+    ASSERT_OK(Delete(1, "k2"));
+    ASSERT_OK(Delete(1, "k6"));
+
+    get_perf_context()->Reset();
+
+    std::vector<std::string> keys({"k1", "k10", "k11", "k12", "k13", "k14",
+                                   "k15", "k16", "k17", "k2", "k3", "k4",
+                                   "k5", "k6", "k7", "k8", "k9", "no_key"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    ro.value_size_soft_limit = 20;
+    ro.async_io = std::get<1>(GetParam());
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+
+    // In memory keys
+    ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10");
+    ASSERT_TRUE(s[9].IsNotFound());  // k2
+    ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_");
+    ASSERT_TRUE(s[13].IsNotFound());  // k6
+    ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_");
+
+    // In sst files
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v11");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13");
+    ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14");
+
+    // Remaining keys aborted once the value size limit is exceeded.
+    ASSERT_TRUE(s[3].IsAborted());
+    ASSERT_TRUE(s[6].IsAborted());
+    ASSERT_TRUE(s[7].IsAborted());
+    ASSERT_TRUE(s[8].IsAborted());
+    ASSERT_TRUE(s[10].IsAborted());
+    ASSERT_TRUE(s[11].IsAborted());
+    ASSERT_TRUE(s[14].IsAborted());
+    ASSERT_TRUE(s[15].IsAborted());
+    ASSERT_TRUE(s[17].IsAborted());
+
+    // 7 kv pairs * 3 bytes per value (i.e.
21)
+    ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes);
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO");
+    return;
+  }
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  int num_keys = 0;
+
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 64; i += 3) {
+    ASSERT_OK(
+        Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 64; i += 5) {
+    ASSERT_OK(
+        Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  ASSERT_EQ(0, num_keys);
+
+  for (int i = 0; i < 64; i += 9) {
+    ASSERT_OK(
+        Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+  }
+
+  std::vector<std::string> keys_str;
+  for (int i = 10; i < 50; ++i) {
+    keys_str.push_back("key_" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys(keys_str.size());
+  for (int i = 0; i < 40; i++) {
+    keys[i] = Slice(keys_str[i]);
+  }
+
+  std::vector<PinnableSlice> values(keys_str.size());
+  std::vector<Status> statuses(keys_str.size());
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+  read_options.value_size_soft_limit = 380;
+  read_options.async_io = std::get<1>(GetParam());
+  db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
+                keys.data(), values.data(), statuses.data());
+
+  ASSERT_EQ(values.size(), keys.size());
+
+  for (unsigned int j = 0; j < 26; ++j) {
+    int key = j + 10;
+    std::string value;
+    value.append("val_l2_" + std::to_string(key));
+    if (key % 3 == 0) {
+      value.append(",");
+      value.append("val_l1_" + std::to_string(key));
+    }
+    if (key % 5 == 0) {
+      value.append(",");
+      value.append("val_l0_" + std::to_string(key));
+    }
+    if (key % 9 == 0) {
+      value.append(",");
+      value.append("val_mem_" + std::to_string(key));
+    }
+    ASSERT_EQ(values[j], value);
+    ASSERT_OK(statuses[j]);
+  }
+
+  // All remaining keys' status is set to Status::Aborted
+  for (unsigned int j = 26; j < 40; j++) {
+    ASSERT_TRUE(statuses[j].IsAborted());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+                        testing::Combine(testing::Bool(), testing::Bool()));
+
+#if USE_COROUTINES
+class DBMultiGetAsyncIOTest : public DBBasicTest,
+                              public ::testing::WithParamInterface<bool> {
+ public:
+  DBMultiGetAsyncIOTest()
+      : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    options_ = CurrentOptions();
+
options_.disable_auto_compactions = true; + options_.statistics = statistics_; + options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options_); + int num_keys = 0; + + // Put all keys in the bottommost level, and overwrite some keys + // in L0 and L1 + for (int i = 0; i < 256; ++i) { + EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + EXPECT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + EXPECT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 128; i += 3) { + EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + EXPECT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + EXPECT_OK(Flush()); + num_keys = 0; + } + // Put some range deletes in L1 + for (int i = 128; i < 256; i += 32) { + std::string range_begin = Key(i); + std::string range_end = Key(i + 16); + EXPECT_OK(dbfull()->DeleteRange(WriteOptions(), + dbfull()->DefaultColumnFamily(), + range_begin, range_end)); + // Also do some Puts to force creation of bloom filter + for (int j = i + 16; j < i + 32; ++j) { + if (j % 3 == 0) { + EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j))); + } + } + EXPECT_OK(Flush()); + } + MoveFilesToLevel(1); + + for (int i = 0; i < 128; i += 5) { + EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + EXPECT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + EXPECT_OK(Flush()); + num_keys = 0; + } + EXPECT_EQ(0, num_keys); + } + + const std::shared_ptr& statistics() { return statistics_; } + + protected: + void ReopenDB() { Reopen(options_); } + + private: + std::shared_ptr statistics_; + Options options_; +}; + +TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { + // All 3 keys in L0. The L0 files should be read serially. 
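+  // A sketch of what the assertions below measure: each of the three keys
+  // resolves in a different L0 file, and L0 files are normally probed one at
+  // a time, so plain async_io yields three MultiRead batches of one read
+  // each. With
+  //
+  //   ReadOptions ro;
+  //   ro.async_io = true;
+  //   ro.optimize_multiget_for_io = true;
+  //
+  // the three file reads are instead issued as a single batch of parallel
+  // coroutine reads, which is what MULTIGET_IO_BATCH_SIZE records.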
+ std::vector key_strs{Key(0), Key(40), Key(80)}; + std::vector keys{key_strs[0], key_strs[1], key_strs[2]}; + std::vector values(key_strs.size()); + std::vector statuses(key_strs.size()); + + ReadOptions ro; + ro.async_io = true; + ro.optimize_multiget_for_io = GetParam(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + ASSERT_EQ(values.size(), 3); + ASSERT_OK(statuses[0]); + ASSERT_OK(statuses[1]); + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[0], "val_l0_" + std::to_string(0)); + ASSERT_EQ(values[1], "val_l0_" + std::to_string(40)); + ASSERT_EQ(values[2], "val_l0_" + std::to_string(80)); + + HistogramData multiget_io_batch_size; + + statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); + + // With async IO, lookups will happen in parallel for each key + if (GetParam()) { + ASSERT_EQ(multiget_io_batch_size.count, 1); + ASSERT_EQ(multiget_io_batch_size.max, 3); + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); + } else { + // Without Async IO, MultiGet will call MultiRead 3 times, once for each + // L0 file + ASSERT_EQ(multiget_io_batch_size.count, 3); + } +} + +TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { + std::vector key_strs; + std::vector keys; + std::vector values; + std::vector statuses; + + key_strs.push_back(Key(33)); + key_strs.push_back(Key(54)); + key_strs.push_back(Key(102)); + keys.push_back(key_strs[0]); + keys.push_back(key_strs[1]); + keys.push_back(key_strs[2]); + values.resize(keys.size()); + statuses.resize(keys.size()); + + ReadOptions ro; + ro.async_io = true; + ro.optimize_multiget_for_io = GetParam(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + ASSERT_EQ(values.size(), 3); + ASSERT_EQ(statuses[0], Status::OK()); + ASSERT_EQ(statuses[1], Status::OK()); + ASSERT_EQ(statuses[2], Status::OK()); + ASSERT_EQ(values[0], "val_l1_" + std::to_string(33)); + ASSERT_EQ(values[1], "val_l1_" + std::to_string(54)); + ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); + + HistogramData multiget_io_batch_size; + + statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); + + // A batch of 3 async IOs is expected, one for each overlapping file in L1 + ASSERT_EQ(multiget_io_batch_size.count, 1); + ASSERT_EQ(multiget_io_batch_size.max, 3); + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); +} + +TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) { + std::vector key_strs; + std::vector keys; + std::vector values; + std::vector statuses; + + key_strs.push_back(Key(33)); + key_strs.push_back(Key(54)); + key_strs.push_back(Key(102)); + keys.push_back(key_strs[0]); + keys.push_back(key_strs[1]); + keys.push_back(key_strs[2]); + values.resize(keys.size()); + statuses.resize(keys.size()); + + SyncPoint::GetInstance()->SetCallBack( + "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) { + static int count = 0; + count++; + // Fail the last table reader open, which is the 6th SST file + // since 3 overlapping L0 files + 3 L1 files containing the keys + if (count == 6) { + Status* s = static_cast(status); + *s = Status::IOError(); + } + }); + // DB open will create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. 
This will
+  // prevent file open during DB open and force the file to be opened
+  // during MultiGet
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReopenDB();
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::IOError());
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // Two of the three overlapping L1 files are read in one async IO batch;
+  // the third file's open fails with the injected IOError, so only 2
+  // coroutine reads are issued
+  ASSERT_EQ(multiget_io_batch_size.count, 1);
+  ASSERT_EQ(multiget_io_batch_size.max, 2);
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 21 is the last key in the first L1 file
+  key_strs.push_back(Key(21));
+  key_strs.push_back(Key(54));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::OK());
+  ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
+  ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+  ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // Since the first MultiGet key is the last key in a file, the MultiGet is
+  // expected to look up in that file first, before moving on to other files.
+  // So the first file lookup will issue one async read, and the next lookup
+  // will look up 2 files in parallel and issue 2 async reads
+  ASSERT_EQ(multiget_io_batch_size.count, 2);
+  ASSERT_EQ(multiget_io_batch_size.max, 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 33 and 102 are in L1, and 56 is in L2
+  key_strs.push_back(Key(33));
+  key_strs.push_back(Key(56));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::OK());
+  ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+  ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
+  ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // There are 2 keys in L1 in two separate files, and 1 in L2. With
+  // optimize_multiget_for_io, all three lookups will happen in parallel.
+  // Otherwise, the L2 lookup will happen after L1.
+  ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2);
+  ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 19 and 26 are in L2, but overlap with L0 and L1 file ranges
+  key_strs.push_back(Key(19));
+  key_strs.push_back(Key(26));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
+  ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest,
GetFromL1AndL2WithRangeDelInL1) { + std::vector key_strs; + std::vector keys; + std::vector values; + std::vector statuses; + + // 139 and 163 are in L2, but overlap with a range deletes in L1 + key_strs.push_back(Key(139)); + key_strs.push_back(Key(144)); + key_strs.push_back(Key(163)); + keys.push_back(key_strs[0]); + keys.push_back(key_strs[1]); + keys.push_back(key_strs[2]); + values.resize(keys.size()); + statuses.resize(keys.size()); + + ReadOptions ro; + ro.async_io = true; + ro.optimize_multiget_for_io = GetParam(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(statuses[0], Status::NotFound()); + ASSERT_EQ(statuses[1], Status::OK()); + ASSERT_EQ(values[1], "val_l1_" + std::to_string(144)); + ASSERT_EQ(statuses[2], Status::NotFound()); + + // Bloom filters in L0/L1 will avoid the coroutine calls in those levels + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); +} + +INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest, + testing::Bool()); +#endif // USE_COROUTINES + +TEST_F(DBBasicTest, MultiGetStats) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + int total_keys = 2000; + std::vector keys_str(total_keys); + std::vector keys(total_keys); + static size_t kMultiGetBatchSize = 100; + std::vector values(kMultiGetBatchSize); + std::vector s(kMultiGetBatchSize); + ReadOptions read_opts; + + Random rnd(309); + // Create Multiple SST files at multiple levels. 
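+  // For reference, the histograms checked at the end are read through the
+  // standard Statistics accessor, e.g.:
+  //
+  //   HistogramData h;
+  //   options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &h);
+  //
+  // where h.min and h.max bound the number of SST files a single MultiGet
+  // read from any one level.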
+  for (int i = 0; i < 500; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  for (int i = 501; i < 1000; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  for (int i = 1001; i < total_keys; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(1, 1);
+  Close();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(options.statistics->Reset());
+
+  db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
+                values.data(), s.data(), false);
+
+  ASSERT_EQ(values.size(), kMultiGetBatchSize);
+  HistogramData hist_level;
+  HistogramData hist_index_and_filter_blocks;
+  HistogramData hist_sst;
+
+  options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+  options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                                    &hist_index_and_filter_blocks);
+  options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
+
+  // Maximum number of blocks read from a file system in a level.
+  ASSERT_EQ(hist_level.max, 1);
+  ASSERT_GT(hist_index_and_filter_blocks.max, 0);
+  // Maximum number of sst files read from file system in a level.
+  ASSERT_EQ(hist_sst.max, 2);
+
+  // Minimum number of blocks read in a level.
+  ASSERT_EQ(hist_level.min, 1);
+  ASSERT_GT(hist_index_and_filter_blocks.min, 0);
+  // Minimum number of sst files read in a level.
+ ASSERT_EQ(hist_sst.min, 1); + + for (PinnableSlice& value : values) { + value.Reset(); + } + for (Status& status : s) { + status = Status::OK(); + } + db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950], + values.data(), s.data(), false); + options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level); + ASSERT_EQ(hist_level.max, 2); +} + +// Test class for batched MultiGet with prefix extractor +// Param bool - If true, use partitioned filters +// If false, use full filter block +class MultiGetPrefixExtractorTest : public DBBasicTest, + public ::testing::WithParamInterface { +}; + +TEST_P(MultiGetPrefixExtractorTest, Batched) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_prefix_bloom_size_ratio = 10; + BlockBasedTableOptions bbto; + if (GetParam()) { + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.partition_filters = true; + } + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + bbto.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + + ASSERT_OK(Put("k", "v0")); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk3", "v3")); + ASSERT_OK(Put("kk4", "v4")); + std::vector keys( + {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"}); + std::vector expected( + {"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"}); + std::vector values; + values = MultiGet(keys, nullptr); + ASSERT_EQ(values, expected); + // One key ("k") is not queried against the filter because it is outside + // the prefix_extractor domain, leaving 6 keys with queried prefixes. 
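+  // With NewFixedPrefixTransform(2), the queried prefixes are "kk" for
+  // kk1..kk4 (4 bloom hits, since that prefix exists) and "ro"/"lm" for the
+  // two non-existent keys (2 bloom misses); "k" is shorter than 2 bytes and
+  // therefore out of the transform's domain.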
+  ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4);
+  ASSERT_OK(Flush());
+
+  get_perf_context()->Reset();
+  values = MultiGet(keys, nullptr);
+  ASSERT_EQ(values, expected);
+  ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+
+  // Also check Get stat
+  get_perf_context()->Reset();
+  for (size_t i = 0; i < keys.size(); ++i) {
+    values[i] = Get(keys[i]);
+  }
+  ASSERT_EQ(values, expected);
+  ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+                        ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+                               public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+  do {
+    option_config_ = kRowCache;
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    const Snapshot* snap1 = dbfull()->GetSnapshot();
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Flush(1));
+    const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+    get_perf_context()->Reset();
+
+    std::vector<std::string> keys({"no_key", "k5", "k4", "k3", "k1"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    bool use_snapshots = GetParam();
+    if (use_snapshots) {
+      ro.snapshot = snap2;
+    }
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+    ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_TRUE(s[2].IsNotFound());
+    ASSERT_OK(s[3]);
+    ASSERT_OK(s[4]);
+
+    // Call MultiGet() again with some intersection with the previous set of
+    // keys. Those should already be in the row cache.
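+    // The row cache itself comes from the harness's option_config_ =
+    // kRowCache above; outside the harness it would be enabled with
+    // something like:
+    //
+    //   options.row_cache = NewLRUCache(8 << 20);  // hypothetical capacity
+    //
+    // Row-cache hits are visible via the ROW_CACHE_HIT ticker checked below.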
+    keys.assign({"no_key", "k5", "k3", "k2"});
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i].Reset();
+      s[i] = Status::OK();
+    }
+    get_perf_context()->Reset();
+
+    if (use_snapshots) {
+      ro.snapshot = snap1;
+    }
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(),
+                  values.data(), s.data(), false);
+
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+    ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_OK(s[3]);
+    if (use_snapshots) {
+      // Only reads from the first SST file would have been cached, since
+      // snapshot seq no is > fd.largest_seqno
+      ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+    } else {
+      ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+    }
+
+    SetPerfLevel(kDisable);
+    dbfull()->ReleaseSnapshot(snap1);
+    dbfull()->ReleaseSnapshot(snap2);
+  } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+                        testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  const size_t kNumInserts = 4;
+  const size_t kNumDeletes = 4;
+  const size_t kNumUpdates = 4;
+
+  // Check default column family
+  for (size_t i = 0; i != kNumInserts; ++i) {
+    ASSERT_OK(Put(std::to_string(i), "value"));
+  }
+  for (size_t i = 0; i != kNumUpdates; ++i) {
+    ASSERT_OK(Put(std::to_string(i), "value1"));
+  }
+  for (size_t i = 0; i != kNumDeletes; ++i) {
+    ASSERT_OK(Delete(std::to_string(i)));
+  }
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+  for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
+    if (i % 3 == 0) {
+      ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
+    } else {
+      ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
+    }
+  }
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+  // Check non-default column family
+  for (size_t i = 0; i + 1 != kNumInserts; ++i) {
+    ASSERT_OK(Put(1, std::to_string(i), "value"));
+  }
+  for (size_t i = 0; i + 1 != kNumUpdates; ++i) {
+    ASSERT_OK(Put(1, std::to_string(i), "value1"));
+  }
+  for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
+    ASSERT_OK(Delete(1, std::to_string(i)));
+  }
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+
+TEST_F(DBBasicTest, ValueTypeString) {
+  KeyVersion key_version;
+  // when adding new type, please also update `value_type_string_map`
+  for (unsigned char i = ValueType::kTypeDeletion;
+       i < ValueType::kTypeMaxValid; i++) {
+    key_version.type = i;
+    ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+  Options options = CurrentOptions();
+  Random rnd(301);
+
BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_size = 16 * 1024; + ASSERT_TRUE(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + std::string value(rnd.RandomString(128) + zero_str); + assert(Put(Key(i), value) == Status::OK()); + } + ASSERT_OK(Flush()); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); +} + +TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (size_t i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string value_str = std::to_string(cf) + "_" + std::to_string(i); + + ASSERT_OK(Put(static_cast(cf), key_str, value_str)); + if (0 == (i % 1000)) { + ASSERT_OK(Flush(static_cast(cf))); + } + } + } + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Flush(static_cast(cf))); + } + Close(); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (int i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string expected_value_str = + std::to_string(cf) + "_" + std::to_string(i); + ASSERT_EQ(expected_value_str, Get(static_cast(cf), key_str)); + } + } +} + +TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.best_efforts_recovery = true; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#ifndef ROCKSDB_LITE +namespace { +class TableFileListener : public EventListener { + public: + void OnTableFileCreated(const TableFileCreationInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + cf_to_paths_[info.cf_name].push_back(info.file_path); + } + std::vector& GetFiles(const std::string& cf_name) { + InstrumentedMutexLock lock(&mutex_); + return 
cf_to_paths_[cf_name]; + } + + private: + InstrumentedMutex mutex_; + std::unordered_map> cf_to_paths_; +}; +} // anonymous namespace + +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. + constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + +TEST_F(DBBasicTest, RecoverWithMissingFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + // Disable auto compaction to simplify SST file name tracking. + options.disable_auto_compactions = true; + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", + "eevee"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Put(static_cast(cf), "a", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "b", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "c", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + } + + // Delete and corrupt files + for (size_t i = 0; i < all_cf_names.size(); ++i) { + std::vector& files = listener->GetFiles(all_cf_names[i]); + ASSERT_EQ(3, files.size()); + std::string corrupted_data; + ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data)); + ASSERT_OK(WriteStringToFile( + env_, corrupted_data.substr(0, corrupted_data.size() - 2), + files[files.size() - 1], /*should_sync=*/true)); + for (int j = static_cast(files.size() - 2); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(all_cf_names, options); + // Verify data + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[2])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); 
+ iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("b", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } +} + +TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + ASSERT_OK(Flush()); + Close(); + { + // Hack by adding a new MANIFEST with high file number + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000", + /*should_sync=*/true)); + } + { + // Hack by adding a corrupted SST not referenced by any MANIFEST + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst", + /*should_sync=*/true)); + } + + options.best_efforts_recovery = true; + + Reopen(options); + ASSERT_OK(Put("bar", "value")); +} + +TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put(1, "bar", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(Flush(1)); + Close(); + ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_))); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + std::vector cf_names; + ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names)); + ASSERT_EQ(2, cf_names.size()); + for (const auto& name : cf_names) { + ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu"); + } +} + +TEST_F(DBBasicTest, RecoverWithNoManifest) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + Close(); + { + // Delete all MANIFEST. + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kWalFile; + if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { + ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); + } + } + } + options.best_efforts_recovery = true; + options.create_if_missing = false; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsInvalidArgument()); + options.create_if_missing = true; + Reopen(options); + // Since no MANIFEST exists, best-efforts recovery creates a new, empty db. 
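+  // Hence the lookup below: "foo" was flushed before the MANIFEST was
+  // deleted, but without a MANIFEST there is no record of the SST file, so
+  // the freshly created db cannot see it.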
+ ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector kAllCfNames = {kDefaultColumnFamilyName, "pikachu"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (int cf = 0; cf < static_cast(kAllCfNames.size()); ++cf) { + ASSERT_OK(Put(cf, "a", "0_value")); + ASSERT_OK(Flush(cf)); + ASSERT_OK(Put(cf, "b", "0_value")); + } + // Delete files + for (size_t i = 0; i < kAllCfNames.size(); ++i) { + std::vector& files = listener->GetFiles(kAllCfNames[i]); + ASSERT_EQ(1, files.size()); + for (int j = static_cast(files.size() - 1); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(kAllCfNames, options); + // Verify WAL is not applied + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + +TEST_F(DBBasicTest, DisableTrackWal) { + // If WAL tracking was enabled, and then disabled during reopen, + // the previously tracked WALs should be removed from MANIFEST. + + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + // extremely small write buffer size, + // so that new WALs are created more frequently. + options.write_buffer_size = 100; + options.env = env_; + DestroyAndReopen(options); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->SyncWAL()); + // Some WALs are tracked. + ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Disable WAL tracking. + options.track_and_verify_wals_in_manifest = false; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + // Previously tracked WALs are cleared. + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Re-enable WAL tracking again. 
+ options.track_and_verify_wals_in_manifest = true; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, ManifestChecksumMismatch) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) { + auto* crc = reinterpret_cast(arg); + *crc = *crc + 1; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions write_opts; + write_opts.disableWAL = true; + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(Put("foo", "value1")); + ASSERT_OK(Flush()); + s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBBasicTest, ConcurrentlyCloseDB) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::vector workers; + for (int i = 0; i < 10; i++) { + workers.push_back(std::thread([&]() { + auto s = db_->Close(); + ASSERT_OK(s); + })); + } + for (auto& w : workers) { + w.join(); + } +} + +#ifndef ROCKSDB_LITE +class DBBasicTestTrackWal : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestTrackWal() + : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {} + + int CountWalFiles() { + VectorLogPtr log_files; + EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); + return static_cast(log_files.size()); + }; +}; + +TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { + // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, + // then if SyncWAL is called afterwards, the obsolete WAL should not be + // tracked in MANIFEST. + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.track_and_verify_wals_in_manifest = true; + options.atomic_flush = GetParam(); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf"}, options); + ASSERT_EQ(handles_.size(), 2); // default, cf + // Do not delete WALs. + ASSERT_OK(db_->DisableFileDeletions()); + constexpr int n = 10; + std::vector> wals(n); + for (size_t i = 0; i < n; i++) { + // Generate a new WAL for each key-value. + const int cf = i % 2; + ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); + ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); + ASSERT_OK(Flush({0, 1})); + } + ASSERT_EQ(CountWalFiles(), n); + // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. + ASSERT_OK(db_->SyncWAL()); + + // Manually delete all WALs. + Close(); + for (const auto& wal : wals) { + ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); + } + + // If SyncWAL tracks the obsolete WALs in MANIFEST, + // reopen will fail because the WALs are missing from disk. 
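+  // TryReopen succeeding is the real assertion here: with
+  // track_and_verify_wals_in_manifest, recovery checks every WAL recorded
+  // in the MANIFEST against the directory, and a tracked-but-deleted WAL
+  // would surface as a Status::Corruption at open.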
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options));
+  Destroy(options);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal,
+                        testing::Bool());
+#endif  // ROCKSDB_LITE
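+
+// Fixture for the MultiGet tests below. It populates each column family with
+// two SST files -- one holding compressible values (random bytes padded with
+// zeros) and one holding incompressible values -- and can wrap the block
+// caches in counting wrappers so tests can assert exact cache traffic.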
+class DBBasicTestMultiGet : public DBTestBase {
+ public:
+  DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache,
+                      bool uncompressed_cache, bool _compression_enabled,
+                      bool _fill_cache, uint32_t compression_parallel_threads)
+      : DBTestBase(test_dir, /*env_do_fsync=*/false) {
+    compression_enabled_ = _compression_enabled;
+    fill_cache_ = _fill_cache;
+
+    if (compressed_cache) {
+      std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+      compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+    }
+    if (uncompressed_cache) {
+      std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+      uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+    }
+
+    env_->count_random_reads_ = true;
+
+    Options options = CurrentOptions();
+    Random rnd(301);
+    BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+    if (compression_enabled_) {
+      std::vector<CompressionType> compression_types;
+      compression_types = GetSupportedCompressions();
+      // Not every platform may have compression libraries available, so
+      // dynamically pick based on what's available
+      CompressionType tmp_type = kNoCompression;
+      for (auto c_type : compression_types) {
+        if (c_type != kNoCompression) {
+          tmp_type = c_type;
+          break;
+        }
+      }
+      if (tmp_type != kNoCompression) {
+        options.compression = tmp_type;
+      } else {
+        compression_enabled_ = false;
+      }
+    }
+#else
+    // GetSupportedCompressions() is not available in LITE build
+    if (!Snappy_Supported()) {
+      compression_enabled_ = false;
+    }
+#endif  // ROCKSDB_LITE
+
+    table_options.block_cache = uncompressed_cache_;
+    if (table_options.block_cache == nullptr) {
+      table_options.no_block_cache = true;
+    } else {
+      table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    }
+    table_options.block_cache_compressed = compressed_cache_;
+    table_options.flush_block_policy_factory.reset(
+        new MyFlushBlockPolicyFactory());
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    if (!compression_enabled_) {
+      options.compression = kNoCompression;
+    } else {
+      options.compression_opts.parallel_threads = compression_parallel_threads;
+    }
+    options_ = options;
+    Reopen(options);
+
+    if (num_cfs > 1) {
+      for (int cf = 0; cf < num_cfs; ++cf) {
+        cf_names_.emplace_back("cf" + std::to_string(cf));
+      }
+      CreateColumnFamilies(cf_names_, options);
+      cf_names_.emplace_back("default");
+    }
+
+    std::string zero_str(128, '\0');
+    for (int cf = 0; cf < num_cfs; ++cf) {
+      for (int i = 0; i < 100; ++i) {
+        // Make the value compressible. A purely random string doesn't
+        // compress, and the resulting data block would not be compressed.
+        values_.emplace_back(rnd.RandomString(128) + zero_str);
+        assert(((num_cfs == 1) ? Put(Key(i), values_[i])
+                               : Put(cf, Key(i), values_[i])) == Status::OK());
+      }
+      if (num_cfs == 1) {
+        EXPECT_OK(Flush());
+      } else {
+        EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+      }
+
+      for (int i = 0; i < 100; ++i) {
+        // These blocks cannot gain space by compression.
+        uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0');
+        std::string tmp_key = "a" + Key(i);
+        assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i])
+                               : Put(cf, tmp_key, uncompressable_values_[i])) ==
+               Status::OK());
+      }
+      if (num_cfs == 1) {
+        EXPECT_OK(Flush());
+      } else {
+        EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+      }
+    }
+    // Clear compressed cache, which is always pre-populated
+    if (compressed_cache_) {
+      compressed_cache_->SetCapacity(0);
+      compressed_cache_->SetCapacity(1048576);
+    }
+  }
+
+  bool CheckValue(int i, const std::string& value) {
+    if (values_[i].compare(value) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  bool CheckUncompressableValue(int i, const std::string& value) {
+    if (uncompressable_values_[i].compare(value) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  const std::vector<std::string>& GetCFNames() const { return cf_names_; }
+
+  int num_lookups() { return uncompressed_cache_->num_lookups(); }
+  int num_found() { return uncompressed_cache_->num_found(); }
+  int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+  int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+  int num_found_compressed() { return compressed_cache_->num_found(); }
+  int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+  bool fill_cache() { return fill_cache_; }
+  bool compression_enabled() { return compression_enabled_; }
+  bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+  bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+  Options get_options() { return options_; }
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+ protected:
+  class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+   public:
+    MyFlushBlockPolicyFactory() {}
+
+    virtual const char* Name() const override {
+      return "MyFlushBlockPolicyFactory";
+    }
+
+    virtual FlushBlockPolicy* NewFlushBlockPolicy(
+        const BlockBasedTableOptions& /*table_options*/,
+        const BlockBuilder& data_block_builder) const override {
+      return new MyFlushBlockPolicy(data_block_builder);
+    }
+  };
+
+  class MyFlushBlockPolicy : public FlushBlockPolicy {
+   public:
+    explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+        : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+    bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+      if (data_block_builder_.empty()) {
+        // First key in this block
+        num_keys_ = 1;
+        return false;
+      }
+      // Flush every 10 keys
+      if (num_keys_ == 10) {
+        num_keys_ = 1;
+        return true;
+      }
+      num_keys_++;
+      return false;
+    }
+
+   private:
+    int num_keys_;
+    const BlockBuilder& data_block_builder_;
+  };
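+
+  // A counting wrapper around the real block cache. Tests use the lookup,
+  // found, and insert counters below to assert exactly how many blocks were
+  // requested from and added to the cache.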
+  class MyBlockCache : public CacheWrapper {
+   public:
+    explicit MyBlockCache(std::shared_ptr<Cache> target)
+        : CacheWrapper(target),
+          num_lookups_(0),
+          num_found_(0),
+          num_inserts_(0) {}
+
+    const char* Name() const override { return "MyBlockCache"; }
+
+    using Cache::Insert;
+    Status Insert(const Slice& key, void* value, size_t charge,
+                  void (*deleter)(const Slice& key, void* value),
+                  Handle** handle = nullptr,
+                  Priority priority = Priority::LOW) override {
+      num_inserts_++;
+      return target_->Insert(key, value, charge, deleter, handle, priority);
+    }
+
+    using Cache::Lookup;
+    Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+      num_lookups_++;
+      Handle* handle = target_->Lookup(key, stats);
+      if (handle != nullptr) {
+        num_found_++;
+      }
+      return handle;
+    }
+
+    int num_lookups() { return num_lookups_; }
+
+    int num_found() { return num_found_; }
+
+    int num_inserts() { return num_inserts_; }
+
+   private:
+    int num_lookups_;
+    int num_found_;
+    int num_inserts_;
+  };
+
+  std::shared_ptr<MyBlockCache> compressed_cache_;
+  std::shared_ptr<MyBlockCache> uncompressed_cache_;
+  Options options_;
+  bool compression_enabled_;
+  std::vector<std::string> values_;
+  std::vector<std::string> uncompressable_values_;
+  bool fill_cache_;
+  std::vector<std::string> cf_names_;
+};
+
+class DBBasicTestWithParallelIO
+    : public DBBasicTestMultiGet,
+      public testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, uint32_t>> {
+ public:
+  DBBasicTestWithParallelIO()
+      : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1,
+                            std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam())) {}
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set the initial size to
+  // the largest we think we will need.
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+  int random_reads = env_->random_read_counter_.Read();
+  key_data[0] = Key(1);
+  key_data[1] = Key(51);
+  keys[0] = Slice(key_data[0]);
+  keys[1] = Slice(key_data[1]);
+  values[0].Reset();
+  values[1].Reset();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+  bool read_from_cache = false;
+  if (fill_cache()) {
+    if (has_uncompressed_cache()) {
+      read_from_cache = true;
+    } else if (has_compressed_cache() && compression_enabled()) {
+      read_from_cache = true;
+    }
+  }
+
+  int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
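+
+  // Since MyFlushBlockPolicy cuts a data block every 10 keys, keys in the
+  // same decade (e.g. 81..85) share a data block; the batches below are
+  // chosen so the number of distinct data blocks touched, and hence the
+  // expected number of file reads, stays predictable.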
+  keys.resize(10);
+  statuses.resize(10);
+  std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+  for (size_t i = 0; i < key_ints.size(); ++i) {
+    key_data[i] = Key(key_ints[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_ints.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 2 : 3);
+  } else {
+    expected_reads += (read_from_cache ? 2 : 4);
+  }
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+  keys.resize(10);
+  statuses.resize(10);
+  std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+  for (size_t i = 0; i < key_uncmp.size(); ++i) {
+    key_data[i] = "a" + Key(key_uncmp[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_uncmp.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 3 : 3);
+  } else {
+    expected_reads += (read_from_cache ? 4 : 4);
+  }
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+  keys.resize(5);
+  statuses.resize(5);
+  std::vector<int> key_tr{1, 2, 15, 16, 55};
+  for (size_t i = 0; i < key_tr.size(); ++i) {
+    key_data[i] = "a" + Key(key_tr[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_tr.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 0 : 2);
+    ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+  } else {
+    if (has_uncompressed_cache()) {
+      expected_reads += (read_from_cache ? 0 : 3);
+      ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+    } else {
+      // A rare case: even if block compression is enabled, some data blocks
+      // are not compressed because of their content. If the user only
+      // enables the compressed cache, those uncompressed blocks will not be
+      // cached, and block reads will be triggered. The number of reads
+      // depends on the compression algorithm.
+      ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) {
+  class FakeDirectIOEnv : public EnvWrapper {
+    class FakeDirectIOSequentialFile;
+    class FakeDirectIORandomAccessFile;
+
+   public:
+    FakeDirectIOEnv(Env* env) : EnvWrapper(env) {}
+    static const char* kClassName() { return "FakeDirectIOEnv"; }
+    const char* Name() const override { return kClassName(); }
+
+    Status NewRandomAccessFile(const std::string& fname,
+                               std::unique_ptr<RandomAccessFile>* result,
+                               const EnvOptions& options) override {
+      std::unique_ptr<RandomAccessFile> file;
+      assert(options.use_direct_reads);
+      EnvOptions opts = options;
+      opts.use_direct_reads = false;
+      Status s = target()->NewRandomAccessFile(fname, &file, opts);
+      if (!s.ok()) {
+        return s;
+      }
+      result->reset(new FakeDirectIORandomAccessFile(std::move(file)));
+      return s;
+    }
+
+   private:
+    class FakeDirectIOSequentialFile : public SequentialFileWrapper {
+     public:
+      FakeDirectIOSequentialFile(std::unique_ptr<SequentialFile>&& file)
+          : SequentialFileWrapper(file.get()), file_(std::move(file)) {}
+      ~FakeDirectIOSequentialFile() {}
+
+      bool use_direct_io() const override { return true; }
+      size_t GetRequiredBufferAlignment() const override { return 1; }
+
+     private:
+      std::unique_ptr<SequentialFile> file_;
+    };
+
+    class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper {
+     public:
+      FakeDirectIORandomAccessFile(std::unique_ptr<RandomAccessFile>&& file)
+          : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {}
+      ~FakeDirectIORandomAccessFile() {}
+
+      bool use_direct_io() const override { return true; }
+      size_t GetRequiredBufferAlignment() const override { return 1; }
+
+     private:
+      std::unique_ptr<RandomAccessFile> file_;
+    };
+  };
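+
+  // FakeDirectIOEnv claims direct IO (with an alignment of 1) while actually
+  // performing buffered reads, so the direct-IO MultiGet code path can be
+  // exercised on platforms and filesystems without real O_DIRECT support.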
+  std::unique_ptr<Env> env(new FakeDirectIOEnv(env_));
+  Options opts = get_options();
+  opts.env = env.get();
+  opts.use_direct_reads = true;
+  Reopen(opts);
+
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set the initial size to
+  // the largest we think we will need.
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+  int random_reads = env_->random_read_counter_.Read();
+  key_data[0] = Key(1);
+  key_data[1] = Key(51);
+  keys[0] = Slice(key_data[0]);
+  keys[1] = Slice(key_data[1]);
+  values[0].Reset();
+  values[1].Reset();
+  if (uncompressed_cache_) {
+    uncompressed_cache_->SetCapacity(0);
+    uncompressed_cache_->SetCapacity(1048576);
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+  bool read_from_cache = false;
+  if (fill_cache()) {
+    if (has_uncompressed_cache()) {
+      read_from_cache = true;
+    } else if (has_compressed_cache() && compression_enabled()) {
+      read_from_cache = true;
+    }
+  }
+
+  int expected_reads = random_reads;
+  if (!compression_enabled() || !has_compressed_cache()) {
+    expected_reads += 2;
+  } else {
+    expected_reads += (read_from_cache ? 0 : 2);
+  }
+  if (env_->random_read_counter_.Read() != expected_reads) {
+    ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+  }
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set the initial size to
+  // the largest we think we will need.
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  int read_count = 0;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
+        Status* s = static_cast<Status*>(status);
+        read_count++;
+        if (read_count == 2) {
+          *s = Status::Corruption();
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::Corruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set the initial size to
+  // the largest we think we will need.
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::MultiGet:FindTable", [&](void* status) {
+        Status* s = static_cast<Status*>(status);
+        *s = Status::IOError();
+      });
+  // DB open will create table readers unless we reduce the table cache
+  // capacity. SanitizeOptions will set max_open_files to a minimum of 20.
+  // The table cache is allocated with max_open_files - 10 as its capacity,
+  // so override max_open_files to 11 so that the table cache capacity
+  // becomes 1. This prevents file opens during DB open and forces the file
+  // to be opened during MultiGet.
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(CurrentOptions());
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_EQ(statuses[0], Status::IOError());
+  ASSERT_EQ(statuses[1], Status::IOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO,
+                        // Params are as follows -
+                        // Param 0 - Compressed cache enabled
+                        // Param 1 - Uncompressed cache enabled
+                        // Param 2 - Data compression enabled
+                        // Param 3 - ReadOptions::fill_cache
+                        // Param 4 - CompressionOptions::parallel_threads
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Values(1, 4)));
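+
+// The DeadlineRandomAccessFile/DeadlineFS pair below injects artificial
+// delays into reads so the ReadOptions deadline and io_timeout handling can
+// be tested deterministically: the filesystem counts IOs, sleeps past the
+// timeout on a chosen IO, and can either return TimedOut itself or leave the
+// failure to the caller's own deadline check.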
+
+// Forward declaration
+class DeadlineFS;
+
+class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  DeadlineRandomAccessFile(DeadlineFS& fs,
+                           std::unique_ptr<FSRandomAccessFile>& file)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+  IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  DeadlineFS& fs_;
+  std::unique_ptr<FSRandomAccessFile> file_;
+};
+
+class DeadlineFS : public FileSystemWrapper {
+ public:
+  // The error_on_delay parameter specifies whether an IOStatus::TimedOut()
+  // status should be returned after delaying the IO to exceed the timeout,
+  // or to simply delay but return success anyway. The latter mimics the
+  // behavior of PosixFileSystem, which does not enforce any timeout.
+  explicit DeadlineFS(SpecialEnv* env, bool error_on_delay)
+      : FileSystemWrapper(env->GetFileSystem()),
+        deadline_(std::chrono::microseconds::zero()),
+        io_timeout_(std::chrono::microseconds::zero()),
+        env_(env),
+        timedout_(false),
+        ignore_deadline_(false),
+        error_on_delay_(error_on_delay) {}
+
+  static const char* kClassName() { return "DeadlineFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    EXPECT_OK(s);
+    result->reset(new DeadlineRandomAccessFile(*this, file));
+
+    const std::chrono::microseconds deadline = GetDeadline();
+    const std::chrono::microseconds io_timeout = GetIOTimeout();
+    if (deadline.count() || io_timeout.count()) {
+      AssertDeadline(deadline, io_timeout, opts.io_options);
+    }
+    return ShouldDelay(opts.io_options);
+  }
+
+  // Set the deadline/io_timeout to enforce, and the IO count at which to
+  // inject the delay.
+  void SetDelayTrigger(const std::chrono::microseconds deadline,
+                       const std::chrono::microseconds io_timeout,
+                       const int trigger) {
+    delay_trigger_ = trigger;
+    io_count_ = 0;
+    deadline_ = deadline;
+    io_timeout_ = io_timeout;
+    timedout_ = false;
+  }
+
+  // Increment the IO counter and, on the trigger IO, sleep past the timeout.
+  IOStatus ShouldDelay(const IOOptions& opts) {
+    if (timedout_) {
+      return IOStatus::TimedOut();
+    } else if (!deadline_.count() && !io_timeout_.count()) {
+      return IOStatus::OK();
+    }
+    if (!ignore_deadline_ && delay_trigger_ == io_count_++) {
+      env_->SleepForMicroseconds(static_cast<int>(opts.timeout.count() + 1));
+      timedout_ = true;
+      if (error_on_delay_) {
+        return IOStatus::TimedOut();
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  const std::chrono::microseconds GetDeadline() {
+    return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_;
+  }
+
+  const std::chrono::microseconds GetIOTimeout() {
+    return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_;
+  }
+
+  bool TimedOut() { return timedout_; }
+
+  void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; }
+
+  void AssertDeadline(const std::chrono::microseconds deadline,
+                      const std::chrono::microseconds io_timeout,
+                      const IOOptions& opts) const {
+    // Give a leeway of +- 10us as it can take some time for the Get/
+    // MultiGet call to reach here, in order to avoid false alarms
+    std::chrono::microseconds now =
+        std::chrono::microseconds(env_->NowMicros());
+    std::chrono::microseconds timeout;
+    if (deadline.count()) {
+      timeout = deadline - now;
+      if (io_timeout.count()) {
+        timeout = std::min(timeout, io_timeout);
+      }
+    } else {
+      timeout = io_timeout;
+    }
+    if (opts.timeout != timeout) {
+      ASSERT_EQ(timeout, opts.timeout);
+    }
+  }
+
+ private:
+  // The number of IOs after which to trigger the delay
+  int delay_trigger_;
+  // Current IO count
+  int io_count_;
+  // ReadOptions deadline for the Get/MultiGet/Iterator
+  std::chrono::microseconds deadline_;
+  // ReadOptions io_timeout for the Get/MultiGet/Iterator
+  std::chrono::microseconds io_timeout_;
+  SpecialEnv* env_;
+  // Flag to indicate whether we injected a delay
+  bool timedout_;
+  // Temporarily ignore deadlines/timeouts
+  bool ignore_deadline_;
+  // Return IOStatus::TimedOut() or IOStatus::OK()
+  bool error_on_delay_;
+};
+
+IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
+                                        const IOOptions& opts, Slice* result,
+                                        char* scratch,
+                                        IODebugContext* dbg) const {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, opts);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
+                                        dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(opts);
+  }
+  return s;
+}
+
+IOStatus DeadlineRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, opts);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
+                                             del_fn, dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(opts);
+  }
+  return s;
+}
+
+IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
+                                             size_t num_reqs,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, options);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(options);
+  }
+  return s;
+}
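+
+// Typical usage in the tests below (a sketch of the pattern, not executed
+// here):
+//   std::shared_ptr<DeadlineFS> fs =
+//       std::make_shared<DeadlineFS>(env_, /*error_on_delay=*/false);
+//   ReadOptions ro;
+//   ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+//   fs->SetDelayTrigger(ro.deadline, ro.io_timeout, /*trigger=*/0);
+// The trigger-th IO then sleeps past the deadline, so subsequent lookups in
+// the same MultiGet are expected to fail with Status::TimedOut().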
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the MultiGet deadline feature.
+class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
+                                    public testing::WithParamInterface<bool> {
+ public:
+  DBBasicTestMultiGetDeadline()
+      : DBBasicTestMultiGet(
+            "db_basic_test_multiget_deadline" /*Test dir*/,
+            10 /*# of column families*/, false /*compressed cache enabled*/,
+            true /*uncompressed cache enabled*/, true /*compression enabled*/,
+            true /*ReadOptions.fill_cache*/,
+            1 /*# of parallel compression threads*/) {}
+
+  inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
+    for (size_t i = 0; i < statuses.size(); ++i) {
+      if (i < num_ok) {
+        EXPECT_OK(statuses[i]);
+      } else {
+        if (statuses[i] != Status::TimedOut()) {
+          EXPECT_EQ(statuses[i], Status::TimedOut());
+        }
+      }
+    }
+  }
+};
+
+TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
+#ifndef USE_COROUTINES
+  if (GetParam()) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  Options options = CurrentOptions();
+
+  std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = env.get();
+  SetTimeElapseOnlySleepOnReopen(&options);
+  ReopenWithColumnFamilies(GetCFNames(), options);
+
+  // Test the non-batched version of MultiGet with multiple column
+  // families
+  std::vector<std::string> key_str;
+  size_t i;
+  for (i = 0; i < 5; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  std::vector<ColumnFamilyHandle*> cfs(key_str.size());
+  std::vector<Slice> keys(key_str.size());
+  std::vector<std::string> values(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    cfs[i] = handles_[i];
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+
+  ReadOptions ro;
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  ro.async_io = GetParam();
+  // Delay the first IO
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+
+  std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+  // The first key is successful because we check after the lookup, but
+  // subsequent keys fail due to deadline exceeded
+  CheckStatus(statuses, 1);
+
+  // Clear the cache
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  // Test non-batched MultiGet with multiple column families and an IO delay
+  // introduced in one of the middle CFs
+  key_str.clear();
+  for (i = 0; i < 10; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  cfs.resize(key_str.size());
+  keys.resize(key_str.size());
+  values.resize(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    // 2 keys per CF
+    cfs[i] = handles_[i / 2];
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+  statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+  CheckStatus(statuses, 3);
+
+  // Test batched MultiGet with an IO delay in the first data block read.
+  // Both keys in the first CF should succeed as they're in the same data
+  // block and would form one batch, and we check for deadline between
+  // batches.
+  std::vector<PinnableSlice> pin_values(keys.size());
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 2);
+
+  // Similar to the previous one, but with an IO delay in the third CF data
+  // block read
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 6);
+
+  // Similar to the previous one, but with an IO delay in the last but one CF
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 8);
+
+  // Test batched MultiGet with a single CF and lots of keys. Inject a delay
+  // into the second batch of keys. As each batch is 32 keys, the first 64
+  // keys, i.e. the first two batches, should succeed and the rest should
+  // time out.
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  key_str.clear();
+  for (i = 0; i < 100; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  keys.resize(key_str.size());
+  pin_values.clear();
+  pin_values.resize(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+  dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 64);
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
+                        ::testing::Bool());
+
+TEST_F(DBBasicTest, ManifestWriteFailure) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto* s = reinterpret_cast<Status*>(arg);
+        ASSERT_OK(*s);
+        // Manually overwrite the return status
+        *s = Status::IOError();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("key", "value"));
+  ASSERT_NOK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+}
+
+TEST_F(DBBasicTest, DestroyDefaultCfHandle) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
CreateAndReopenWithCF({"pikachu"}, options); + for (const auto* h : handles_) { + ASSERT_NE(db_->DefaultColumnFamily(), h); + } + + // We have two handles to the default column family. The two handles point to + // different ColumnFamilyHandle objects. + assert(db_->DefaultColumnFamily()); + ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID()); + assert(handles_[0]); + ASSERT_EQ(0U, handles_[0]->GetID()); + + // You can destroy handles_[...]. + for (auto* h : handles_) { + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); + } + handles_.clear(); + + // But you should not destroy db_->DefaultColumnFamily(), since it's going to + // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used + // elsewhere internally too. + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument()); +} + +TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) { + auto* s = reinterpret_cast(arg); + assert(s); + *s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + ASSERT_EQ(nullptr, options.info_log); + ASSERT_TRUE(s.IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, VerifyFileChecksums) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); + + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + Reopen(options); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Write an L0 with checksum computed. + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Does the right thing but with the wrong name -- using it should lead to an + // error. + class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) + : FileChecksumGenCrc32c(context) {} + + const char* Name() const override { return "sha1"; } + }; + + class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + return std::unique_ptr( + new MisnamedFileChecksumGenerator(context)); + } + }; + + options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); + Reopen(options); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); +} + +// TODO: re-enable after we provide finer-grained control for WAL tracking to +// meet the needs of different use cases, durability levels and recovery modes. +TEST_F(DBBasicTest, DISABLED_ManualWalSync) { + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; + DestroyAndReopen(options); + + ASSERT_OK(Put("x", "y")); + // This does not create a new WAL. 
+
+// TODO: re-enable after we provide finer-grained control for WAL tracking
+// to meet the needs of different use cases, durability levels and recovery
+// modes.
+TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
+  Options options = CurrentOptions();
+  options.track_and_verify_wals_in_manifest = true;
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x", "y"));
+  // This does not create a new WAL.
+  ASSERT_OK(db_->SyncWAL());
+  EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+
+  std::unique_ptr<LogFile> wal;
+  Status s = db_->GetCurrentWalFile(&wal);
+  ASSERT_OK(s);
+  Close();
+
+  EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+
+  ASSERT_TRUE(TryReopen(options).IsCorruption());
+}
+#endif  // !ROCKSDB_LITE
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the deadline/timeout feature.
+class DBBasicTestDeadline
+    : public DBBasicTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBBasicTestDeadline, PointLookupDeadline) {
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  bool set_deadline = std::get<0>(GetParam());
+  bool set_timeout = std::get<1>(GetParam());
+
+  for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+    if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+      continue;
+    }
+    option_config_ = option_config;
+    Options options = CurrentOptions();
+    if (options.use_direct_reads) {
+      continue;
+    }
+    options.env = env.get();
+    options.disable_auto_compactions = true;
+    Cache* block_cache = nullptr;
+    // Filter block reads currently don't cause the request to get
+    // aborted on a read timeout, so it's possible those block reads
+    // may get issued even if the deadline has passed
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Get:BeforeFilterMatch",
+        [&](void* /*arg*/) { fs->IgnoreDeadline(true); });
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Get:AfterFilterMatch",
+        [&](void* /*arg*/) { fs->IgnoreDeadline(false); });
+    // DB open will create table readers unless we reduce the table cache
+    // capacity. SanitizeOptions will set max_open_files to a minimum of 20.
+    // The table cache is allocated with max_open_files - 10 as its capacity,
+    // so override max_open_files to 11 so that the table cache capacity
+    // becomes 1. This prevents file opens during DB open and forces the
+    // file to be opened during the point lookup.
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = (int*)arg;
+          *max_open_files = 11;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    SetTimeElapseOnlySleepOnReopen(&options);
+    Reopen(options);
+
+    if (options.table_factory) {
+      block_cache = options.table_factory->GetOptions<Cache>(
+          TableFactory::kBlockCacheOpts());
+    }
+
+    Random rnd(301);
+    for (int i = 0; i < 400; ++i) {
+      std::string key = "k" + std::to_string(i);
+      ASSERT_OK(Put(key, rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    bool timedout = true;
+    // A timeout will be forced when the IO counter reaches this value
+    int io_deadline_trigger = 0;
+    // Keep incrementing io_deadline_trigger and call Get() until there is
+    // an iteration that doesn't cause a timeout. This ensures that we cover
+    // all file reads in the point lookup path that can potentially time out
+    // and cause the Get() to fail.
+    while (timedout) {
+      ReadOptions ro;
+      if (set_deadline) {
+        ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+      }
+      if (set_timeout) {
+        ro.io_timeout = std::chrono::microseconds{5000};
+      }
+      fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+      block_cache->SetCapacity(0);
+      block_cache->SetCapacity(1048576);
+
+      std::string value;
+      Status s = dbfull()->Get(ro, "k50", &value);
+      if (fs->TimedOut()) {
+        ASSERT_EQ(s, Status::TimedOut());
+      } else {
+        timedout = false;
+        ASSERT_OK(s);
+      }
+      io_deadline_trigger++;
+    }
+    // Reset the delay sequence in order to avoid false alarms during Reopen
+    fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+                        std::chrono::microseconds::zero(), 0);
+  }
+  Close();
+}
+
+TEST_P(DBBasicTestDeadline, IteratorDeadline) {
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  bool set_deadline = std::get<0>(GetParam());
+  bool set_timeout = std::get<1>(GetParam());
+
+  for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+    if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+      continue;
+    }
+    Options options = CurrentOptions();
+    if (options.use_direct_reads) {
+      continue;
+    }
+    options.env = env.get();
+    options.disable_auto_compactions = true;
+    Cache* block_cache = nullptr;
+    // DB open will create table readers unless we reduce the table cache
+    // capacity. SanitizeOptions will set max_open_files to a minimum of 20.
+    // The table cache is allocated with max_open_files - 10 as its capacity,
+    // so override max_open_files to 11 so that the table cache capacity
+    // becomes 1. This prevents file opens during DB open and forces the
+    // file to be opened during iteration.
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = (int*)arg;
+          *max_open_files = 11;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    SetTimeElapseOnlySleepOnReopen(&options);
+    Reopen(options);
+
+    if (options.table_factory) {
+      block_cache = options.table_factory->GetOptions<Cache>(
+          TableFactory::kBlockCacheOpts());
+    }
+
+    Random rnd(301);
+    for (int i = 0; i < 400; ++i) {
+      std::string key = "k" + std::to_string(i);
+      ASSERT_OK(Put(key, rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    bool timedout = true;
+    // A timeout will be forced when the IO counter reaches this value
+    int io_deadline_trigger = 0;
+    // Keep incrementing io_deadline_trigger and iterate until there is an
+    // iteration that doesn't cause a timeout. This ensures that we cover
+    // all file reads in the iterator path that can potentially time out.
+    while (timedout) {
+      ReadOptions ro;
+      if (set_deadline) {
+        ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+      }
+      if (set_timeout) {
+        ro.io_timeout = std::chrono::microseconds{5000};
+      }
+      fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+      block_cache->SetCapacity(0);
+      block_cache->SetCapacity(1048576);
+
+      Iterator* iter = dbfull()->NewIterator(ro);
+      int count = 0;
+      iter->Seek("k50");
+      while (iter->Valid() && count++ < 100) {
+        iter->Next();
+      }
+      if (fs->TimedOut()) {
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_EQ(iter->status(), Status::TimedOut());
+      } else {
+        timedout = false;
+        ASSERT_OK(iter->status());
+      }
+      delete iter;
+      io_deadline_trigger++;
+    }
+    // Reset the delay sequence in order to avoid false alarms during Reopen
+    fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+                        std::chrono::microseconds::zero(), 0);
+  }
+  Close();
+}
+
+// Param 0: If true, set read_options.deadline
+// Param 1: If true, set read_options.io_timeout
+INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, true),
+                                          std::make_tuple(true, true)));
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..db80b82cb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,2313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include
+#include
+#include
+#include
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/lru_cache.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/unique_id_gen.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/compression.h"
+#include "util/defer.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+  size_t miss_count_ = 0;
+  size_t hit_count_ = 0;
+  size_t insert_count_ = 0;
+  size_t failure_count_ = 0;
+  size_t compression_dict_miss_count_ = 0;
+  size_t compression_dict_hit_count_ = 0;
+  size_t compression_dict_insert_count_ = 0;
+  size_t compressed_miss_count_ = 0;
+  size_t compressed_hit_count_ = 0;
+  size_t compressed_insert_count_ = 0;
+  size_t compressed_failure_count_ = 0;
+
+ public:
+  const size_t kNumBlocks = 10;
+  const size_t kValueSize = 100;
+
+  DBBlockCacheTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetTableOptions() {
+    BlockBasedTableOptions table_options;
+    // Set a small enough block size so that each key-value gets its own
+    // block.
+    table_options.block_size = 1;
+    return table_options;
+  }
+
+  Options GetOptions(const BlockBasedTableOptions& table_options) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.avoid_flush_during_recovery = false;
+    // options.compression = kNoCompression;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    return options;
+  }
+
+  void InitTable(const Options& /*options*/) {
+    std::string value(kValueSize, 'a');
+    for (size_t i = 0; i < kNumBlocks; i++) {
+      ASSERT_OK(Put(std::to_string(i), value.c_str()));
+    }
+  }
+
+  void RecordCacheCounters(const Options& options) {
+    miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+    compressed_miss_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+    compressed_hit_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+    compressed_insert_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+    compressed_failure_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+  }
+
+  void RecordCacheCountersForCompressionDict(const Options& options) {
+    compression_dict_miss_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+    compression_dict_hit_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+    compression_dict_insert_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+  }
+
+  void CheckCacheCounters(const Options& options, size_t expected_misses,
+                          size_t expected_hits, size_t expected_inserts,
+                          size_t expected_failures) {
+    size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    size_t new_failure_count =
+        TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+    ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+    ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+    ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+    ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+    miss_count_ = new_miss_count;
+    hit_count_ = new_hit_count;
+    insert_count_ = new_insert_count;
+    failure_count_ = new_failure_count;
+  }
+
+  void CheckCacheCountersForCompressionDict(
+      const Options& options, size_t expected_compression_dict_misses,
+      size_t expected_compression_dict_hits,
+      size_t expected_compression_dict_inserts) {
+    size_t new_compression_dict_miss_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+    size_t new_compression_dict_hit_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+    size_t new_compression_dict_insert_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+    ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+              new_compression_dict_miss_count);
+    ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+              new_compression_dict_hit_count);
+    ASSERT_EQ(
+        compression_dict_insert_count_ + expected_compression_dict_inserts,
+        new_compression_dict_insert_count);
+    compression_dict_miss_count_ = new_compression_dict_miss_count;
+    compression_dict_hit_count_ = new_compression_dict_hit_count;
+    compression_dict_insert_count_ = new_compression_dict_insert_count;
+  }
+
+  void CheckCompressedCacheCounters(const Options& options,
+                                    size_t expected_misses,
+                                    size_t expected_hits,
+                                    size_t expected_inserts,
+                                    size_t expected_failures) {
+    size_t new_miss_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+    size_t new_hit_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+    size_t new_insert_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+    size_t new_failure_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+    ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+    ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+    ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+    ASSERT_EQ(compressed_failure_count_ + expected_failures,
+              new_failure_count);
+    compressed_miss_count_ = new_miss_count;
+    compressed_hit_count_ = new_hit_count;
+    compressed_insert_count_ = new_insert_count;
+    compressed_failure_count_ = new_failure_count;
+  }
+
+#ifndef ROCKSDB_LITE
+  const std::array<size_t, kNumCacheEntryRoles> GetCacheEntryRoleCountsBg() {
+    // Verify in cache entry role stats
+    std::array<size_t, kNumCacheEntryRoles> cache_entry_role_counts;
+    std::map<std::string, std::string> values;
+    EXPECT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                    &values));
+    for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+      auto role = static_cast<CacheEntryRole>(i);
+      cache_entry_role_counts[i] =
+          ParseSizeT(values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+    }
+    return cache_entry_role_counts;
+  }
+#endif  // ROCKSDB_LITE
+};
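+
+// Because GetTableOptions() sets block_size = 1, every key-value pair lands
+// in its own data block, so each distinct key touched maps to exactly one
+// block cache miss/insert. The Check*CacheCounters helpers assert on deltas
+// since the last RecordCacheCounters() call.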
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed so as not to count the entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+  Iterator* iter = nullptr;
+
+  ASSERT_EQ(0, cache->GetUsage());
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(0));
+  ASSERT_LT(0, cache->GetUsage());
+  delete iter;
+  iter = nullptr;
+  ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+  ReadOptions read_options;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed so as not to count the entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+  Iterator* iter = nullptr;
+
+  // Load blocks into cache.
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iter = db_->NewIterator(read_options);
+    iter->Seek(std::to_string(i));
+    ASSERT_OK(iter->status());
+    CheckCacheCounters(options, 1, 0, 1, 0);
+    iterators[i].reset(iter);
+  }
+  size_t usage = cache->GetUsage();
+  ASSERT_LT(0, usage);
+  cache->SetCapacity(usage);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+  // Test with strict capacity limit.
+  cache->SetStrictCapacityLimit(true);
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(kNumBlocks - 1));
+  ASSERT_TRUE(iter->status().IsMemoryLimit());
+  CheckCacheCounters(options, 1, 0, 0, 1);
+  delete iter;
+  iter = nullptr;
+
+  // Release iterators and access cache again.
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iterators[i].reset();
+    CheckCacheCounters(options, 0, 0, 0, 0);
+  }
+  ASSERT_EQ(0, cache->GetPinnedUsage());
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iter = db_->NewIterator(read_options);
+    iter->Seek(std::to_string(i));
+    ASSERT_OK(iter->status());
+    CheckCacheCounters(options, 0, 1, 0, 0);
+    iterators[i].reset(iter);
+  }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = nullptr;
+  table_options.block_size = 1;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.compression = CompressionType::kSnappyCompression;
+
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 0; i < kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+  }
+
+  ReadOptions read_options;
+  std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed so as not to count the entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  table_options.no_block_cache = false;
+  table_options.block_cache_compressed = compressed_cache;
+  table_options.max_auto_readahead_size = 0;
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  // Load blocks into cache.
+  for (size_t i = 0; i < kNumBlocks - 1; i++) {
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    CheckCacheCounters(options, 1, 0, 1, 0);
+    CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+  }
+
+  size_t usage = cache->GetUsage();
+  ASSERT_EQ(0, usage);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+  size_t compressed_usage = compressed_cache->GetUsage();
+  ASSERT_LT(0, compressed_usage);
+  // Compressed block cache cannot be pinned.
+  ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+  // Set the strict capacity limit flag. Now blocks will only load into the
+  // compressed block cache.
+  cache->SetCapacity(usage);
+  cache->SetStrictCapacityLimit(true);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+  // Load the last key's block.
+  ASSERT_EQ(
+      "Operation aborted: Memory limit reached: Insert failed due to LRU cache "
+      "being full.",
+      Get(std::to_string(kNumBlocks - 1)));
+  // Failure will also record the miss counter.
+  CheckCacheCounters(options, 1, 0, 0, 1);
+  CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+
+  // Clear the strict capacity limit flag. This time we shall hit the
+  // compressed block cache and load into the block cache.
+  cache->SetStrictCapacityLimit(false);
+  // Load the last key's block.
+  ASSERT_EQ(value, Get(std::to_string(kNumBlocks - 1)));
+  CheckCacheCounters(options, 1, 0, 1, 0);
+  CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+}
+
+namespace {
+class PersistentCacheFromCache : public PersistentCache {
+ public:
+  PersistentCacheFromCache(std::shared_ptr<Cache> cache, bool read_only)
+      : cache_(cache), read_only_(read_only) {}
+
+  Status Insert(const Slice& key, const char* data,
+                const size_t size) override {
+    if (read_only_) {
+      return Status::NotSupported();
+    }
+    std::unique_ptr<char[]> copy{new char[size]};
+    std::copy_n(data, size, copy.get());
+    Status s = cache_->Insert(
+        key, copy.get(), size,
+        GetCacheEntryDeleterForRole<char[], CacheEntryRole::kMisc>());
+    if (s.ok()) {
+      copy.release();
+    }
+    return s;
+  }
+
+  Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                size_t* size) override {
+    auto handle = cache_->Lookup(key);
+    if (handle) {
+      char* ptr = static_cast<char*>(cache_->Value(handle));
+      *size = cache_->GetCharge(handle);
+      data->reset(new char[*size]);
+      std::copy_n(ptr, *size, data->get());
+      cache_->Release(handle);
+      return Status::OK();
+    } else {
+      return Status::NotFound();
+    }
+  }
+
+  bool IsCompressed() override { return false; }
+
+  StatsType Stats() override { return StatsType(); }
+
+  std::string GetPrintableOptions() const override { return ""; }
+
+  uint64_t NewId() override { return cache_->NewId(); }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  bool read_only_;
+};
+
+class ReadOnlyCacheWrapper : public CacheWrapper {
+  using CacheWrapper::CacheWrapper;
+
+  using Cache::Insert;
+  Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/,
+                void (*)(const Slice& key, void* value) /*deleter*/,
+                Handle** /*handle*/, Priority /*priority*/) override {
+    return Status::NotSupported();
+  }
+};
+
+}  // anonymous namespace
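+
+// PersistentCacheFromCache adapts a regular Cache to the PersistentCache
+// interface while sharing the underlying key space with the block cache.
+// The test below relies on this to provoke the "share the same key space"
+// validation errors at DB open.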
table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache and persistent_cache share the same " + "key space, which is not supported"); + + table_options.block_cache = rw_cache; + table_options.block_cache_compressed.reset(); + table_options.persistent_cache = ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache share the same " + "key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = ro_cache; + table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache_compressed and persistent_cache " + "share the same key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache = ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache_compressed " + "share the same key space, which is not supported"); +} +#endif // SNAPPY + +#ifndef ROCKSDB_LITE + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + uint64_t int_num; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + // Make sure filter block is in cache. + std::string value; + ReadOptions ropt; + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + + // Miss count should remain the same. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Make sure index block is in cache. 
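+  // (Note: the baseline hit count is captured first, presumably because the
+  // earlier KeyMayExist() calls may already have touched the index block;
+  // each Get() below is then expected to add exactly one
+  // BLOCK_CACHE_INDEX_HIT relative to that baseline.)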
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache, then iterate over the entire
+// DB, verifying that the dummy entries inserted by
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key3", "val3"));
+  ASSERT_OK(Put("key4", "val4"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key5", "val5"));
+  ASSERT_OK(Put("key6", "val6"));
+  ASSERT_OK(Flush());
+
+  Iterator* iter = nullptr;
+
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(0));
+  while (iter->Valid()) {
+    iter->Next();
+  }
+  delete iter;
+  iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  LRUCacheOptions co;
+  // 500 bytes are enough to hold the first two blocks
+  co.capacity = 500;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "longer_key", "val"));
+  // Create a new table
+  ASSERT_OK(Flush(1));
+  size_t index_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+  size_t filter_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+  ASSERT_GT(index_bytes_insert, 0);
+  ASSERT_GT(filter_bytes_insert, 0);
+  ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+  // Set the cache capacity to the current usage
+  cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we
+  // can bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+  // Note that the second key needs to be no longer than the first one.
+  // Otherwise the second index block may not fit in cache.
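+  // (Rough sizing intuition: the index block stores one separator key per
+  // data block, so its size tracks key length. With the capacity pinned to
+  // exactly the first file's index + filter bytes above, a longer key could
+  // push the second file's index block over the limit.)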
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table
+  ASSERT_OK(Flush(1));
+  // cache evicted old index and block entries
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+            index_bytes_insert);
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+            filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we
+  // can bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+  //           index_bytes_insert);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+  //           filter_bytes_insert);
+}
+
+#if (defined OS_LINUX || defined OS_WIN)
+TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 1; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+  // Verify that compaction is not counted
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  EXPECT_EQ(kNumBlocks,
+            options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+}
+
+// This test caches data, index and filter blocks during flush.
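+// The GetParam() value selects the filter layout, as implemented by the
+// switch in the test body below:
+//   1 -> partitioned filter + two-level index (kTwoLevelIndexSearch)
+//   2 -> full (non-partitioned) filter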
+class DBBlockCacheTest1 : public DBTestBase,
+                          public ::testing::WithParamInterface<uint32_t> {
+ public:
+  const size_t kNumBlocks = 10;
+  const size_t kValueSize = 100;
+  DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1,
+                        ::testing::Values(1, 2));
+
+TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+
+  uint32_t filter_type = GetParam();
+  switch (filter_type) {
+    case 1:  // partition_filter
+      table_options.partition_filters = true;
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      break;
+    case 2:  // full filter
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      break;
+    default:
+      assert(false);
+  }
+
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 1; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+    if (filter_type == 1) {
+      ASSERT_EQ(2 * i,
+                options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(2 * i,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    } else {
+      ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    }
+    ASSERT_EQ(value, Get(std::to_string(i)));
+
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(i * 3,
+              options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT));
+    if (filter_type == 1) {
+      ASSERT_EQ(i * 3,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+    } else {
+      ASSERT_EQ(i * 2,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+    }
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  }
+
+  // Verify that compaction is not counted
+  CompactRangeOptions cro;
+  // Ensure files are rewritten, not just trivially moved.
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  EXPECT_EQ(kNumBlocks,
+            options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+  // Index and filter blocks are automatically warmed when the new table file
+  // is automatically opened at the end of compaction. This is not easily
+  // disabled, so the new index and filter blocks end up warmed as well.
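+  // Worked example for the expectations below, using kNumBlocks == 10 from
+  // the fixture: the flush loop warmed 10 files and compaction opens 1 more,
+  // so the partitioned case expects 2 * (1 + 10) == 22 index/filter adds
+  // (partition + top-level per file) and the full-filter case 1 + 10 == 11.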
+  if (filter_type == 1) {
+    EXPECT_EQ(2 * (1 + kNumBlocks),
+              options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2 * (1 + kNumBlocks),
+              options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+  } else {
+    EXPECT_EQ(1 + kNumBlocks,
+              options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(1 + kNumBlocks,
+              options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+  }
+}
+
+TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+
+  for (size_t i = 1; i <= 5; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(
+        1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(
+        0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(
+        0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(
+        1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
+
+  for (size_t i = 6; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(
+        0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(
+        1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(
+        1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(
+        0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+}
+#endif
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
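+// (For example, when cache_index_and_filter_blocks_with_high_priority is
+// set, the index and filter blocks inserted by a flush should both land in
+// high_pri_insert_count; the test below asserts exactly that split.)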
+class MockCache : public LRUCache { + public: + static uint32_t high_pri_insert_count; + static uint32_t low_pri_insert_count; + + MockCache() + : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/, + false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/, + 0.0 /*low_pri_pool_ratio*/) {} + + using ShardedCache::Insert; + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper_cb, size_t charge, + Handle** handle, Priority priority) override { + DeleterFn delete_cb = helper_cb->del_cb; + if (priority == Priority::LOW) { + low_pri_insert_count++; + } else { + high_pri_insert_count++; + } + return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); + } +}; + +uint32_t MockCache::high_pri_insert_count = 0; +uint32_t MockCache::low_pri_insert_count = 0; + +} // anonymous namespace + +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) { + for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache.reset(new MockCache()); + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.cache_index_and_filter_blocks_with_high_priority = + priority == Cache::Priority::HIGH ? true : false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + MockCache::high_pri_insert_count = 0; + MockCache::low_pri_insert_count = 0; + + // Create a new table. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + if (priority == Cache::Priority::LOW) { + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(2u, MockCache::low_pri_insert_count); + } else { + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(0u, MockCache::low_pri_insert_count); + } + + // Access data block. + ASSERT_EQ("value", Get("foo")); + + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(3, /*adding data block*/ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + + // Data block should be inserted with low priority. + if (priority == Cache::Priority::LOW) { + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(3u, MockCache::low_pri_insert_count); + } else { + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(1u, MockCache::low_pri_insert_count); + } + } +} + +namespace { + +// An LRUCache wrapper that can falsely report "not found" on Lookup. +// This allows us to manipulate BlockBasedTableReader into thinking +// another thread inserted the data in between Lookup and Insert, +// while mostly preserving the LRUCache interface/behavior. 
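+// (Usage sketch: cache->SetNthLookupNotFound(2) makes the second Lookup
+// from now return nullptr once, so the table reader re-loads and re-inserts
+// that block, which should show up in the *_ADD_REDUNDANT tickers.)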
+class LookupLiarCache : public CacheWrapper {
+  int nth_lookup_not_found_ = 0;
+
+ public:
+  explicit LookupLiarCache(std::shared_ptr<Cache> target)
+      : CacheWrapper(std::move(target)) {}
+
+  using Cache::Lookup;
+  Handle* Lookup(const Slice& key, Statistics* stats) override {
+    if (nth_lookup_not_found_ == 1) {
+      nth_lookup_not_found_ = 0;
+      return nullptr;
+    }
+    if (nth_lookup_not_found_ > 1) {
+      --nth_lookup_not_found_;
+    }
+    return CacheWrapper::Lookup(key, stats);
+  }
+
+  // 1 == next lookup, 2 == after next, etc.
+  void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; }
+};
+
+}  // anonymous namespace
+
+TEST_F(DBBlockCacheTest, AddRedundantStats) {
+  const size_t capacity = size_t{1} << 25;
+  const int num_shard_bits = 0;  // 1 shard
+  int iterations_tested = 0;
+  for (std::shared_ptr<Cache> base_cache :
+       {NewLRUCache(capacity, num_shard_bits),
+        HyperClockCacheOptions(
+            capacity,
+            BlockBasedTableOptions().block_size /*estimated_value_size*/,
+            num_shard_bits)
+            .MakeSharedCache()}) {
+    if (!base_cache) {
+      // Skip clock cache when not supported
+      continue;
+    }
+    ++iterations_tested;
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    std::shared_ptr<LookupLiarCache> cache =
+        std::make_shared<LookupLiarCache>(base_cache);
+
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache = cache;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    // Create a new table.
+    ASSERT_OK(Put("foo", "value"));
+    ASSERT_OK(Put("bar", "value"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+    // Normal access filter+index+data.
+    ASSERT_EQ("value", Get("foo"));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    ASSERT_EQ(0,
+              TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Again access filter+index+data, but force redundant load+insert on
+    // the index
+    cache->SetNthLookupNotFound(2);
+    ASSERT_EQ("value", Get("bar"));
+
+    ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    ASSERT_EQ(0,
+              TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Access just the filter (with high probability), and force a redundant
+    // load+insert
+    cache->SetNthLookupNotFound(1);
+    ASSERT_EQ("NOT_FOUND", Get("this key was not added"));
+
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    EXPECT_EQ(1,
+              TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Access just data, forcing a redundant load+insert
+    ReadOptions read_options;
+    std::unique_ptr<Iterator> iter{db_->NewIterator(read_options)};
+    cache->SetNthLookupNotFound(1);
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), "bar");
+
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    EXPECT_EQ(1,
+              TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+  }
+  EXPECT_GE(iterations_tested, 1);
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger = 2;
+  options.paranoid_file_checks = true;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "1_key", "val"));
+  ASSERT_OK(Put(1, "9_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, /* read and cache data block */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  ASSERT_OK(Put(1, "1_key2", "val2"));
+  ASSERT_OK(Put(1, "9_key2", "val2"));
+  // Create a new SST file. This will further trigger a compaction
+  // and generate another file.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks, NO further block
+  // is added after generating a new file.
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+  ASSERT_OK(Put(1, "1_key3", "val3"));
+  ASSERT_OK(Put(1, "9_key3", "val3"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "1_key4", "val4"));
+  ASSERT_OK(Put(1, "9_key4", "val4"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(3, /* Still 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  int num_iter = 80;
+
+  // Run this test four iterations.
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  // compressed
+  for (int iter = 0; iter < 4; iter++) {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 64 * 1024;  // small write buffer
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    BlockBasedTableOptions table_options;
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        table_options.block_cache = NewLRUCache(8 * 1024);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        table_options.no_block_cache = true;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        table_options.block_cache = NewLRUCache(1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed;
+        // also, make block cache sizes bigger, to trigger block cache hits
+        table_options.block_cache = NewLRUCache(1024 * 1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.compression = kNoCompression;
+        break;
+      default:
+        FAIL();
+    }
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // default column family doesn't have block cache
+    Options no_block_cache_opts;
+    no_block_cache_opts.statistics = options.statistics;
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
+    ReopenWithColumnFamilies(
+        {"default", "pikachu"},
+        std::vector<Options>({no_block_cache_opts, options}));
+
+    Random rnd(301);
+
+    // Write 80 values, ~1KB each. Each random string is reused for four
+    // consecutive keys so that data blocks compress well.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {  // high compression ratio
+        str = rnd.RandomString(1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+
+    // Flush all data from memtable so that reads are from block cache
+    ASSERT_OK(Flush(1));
+
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(1, Key(i)), values[i]);
+    }
+
+    // Check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 3:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // the compressed cache doesn't have any hits since blocks are not
+        // compressed on storage
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+        break;
+      default:
+        FAIL();
+    }
+
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+  }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+  const int kNumFiles = 4;
+  const int kNumEntriesPerFile = 128;
+  const int kNumBytesPerEntry = 1024;
+
+  // Try all the available libraries that support dictionary compression
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+  if (LZ4_Supported()) {
+    compression_types.push_back(kLZ4Compression);
+    compression_types.push_back(kLZ4HCCompression);
+  }
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  } else if (ZSTDNotFinal_Supported()) {
+    compression_types.push_back(kZSTDNotFinalCompression);
+  }
+  Random rnd(301);
+  for (auto compression_type : compression_types) {
+    Options options = CurrentOptions();
+    options.bottommost_compression = compression_type;
+    options.bottommost_compression_opts.max_dict_bytes = 4096;
+    options.bottommost_compression_opts.enabled = true;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache.reset(new MockCache());
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    RecordCacheCountersForCompressionDict(options);
+
+    for (int i = 0; i < kNumFiles; ++i) {
+      ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+      for (int j = 0; j < kNumEntriesPerFile; ++j) {
+        std::string value = rnd.RandomString(kNumBytesPerEntry);
+        ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+      }
+      ASSERT_OK(Flush());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+    // Compression dictionary blocks are preloaded.
+    CheckCacheCountersForCompressionDict(
+        options, kNumFiles /* expected_compression_dict_misses */,
+        0 /* expected_compression_dict_hits */,
+        kNumFiles /* expected_compression_dict_inserts */);
+
+    // Seek to a key in a file. It should cause the SST's dictionary
+    // meta-block to be read.
+    RecordCacheCounters(options);
+    RecordCacheCountersForCompressionDict(options);
+    ReadOptions read_options;
+    ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+    // Two block hits: index and dictionary, since they are prefetched
+    // One block missed/added: data block
+    CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+                       1 /* expected_inserts */, 0 /* expected_failures */);
+    CheckCacheCountersForCompressionDict(
+        options, 0 /* expected_compression_dict_misses */,
+        1 /* expected_compression_dict_hits */,
+        0 /* expected_compression_dict_inserts */);
+  }
+}
+
+static void ClearCache(Cache* cache) {
+  auto roles = CopyCacheDeleterRoleMap();
+  std::deque<std::string> keys;
+  Cache::ApplyToAllEntriesOptions opts;
+  auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/,
+                      Cache::DeleterFn deleter) {
+    if (roles.find(deleter) == roles.end()) {
+      // Keep the stats collector
+      return;
+    }
+    keys.push_back(key.ToString());
+  };
+  cache->ApplyToAllEntries(callback, opts);
+  for (auto& k : keys) {
+    cache->Erase(k);
+  }
+}
+
+TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
+  const size_t capacity = size_t{1} << 25;
+  int iterations_tested = 0;
+  for (bool partition : {false, true}) {
+    for (std::shared_ptr<Cache> cache :
+         {NewLRUCache(capacity),
+          HyperClockCacheOptions(
+              capacity,
+              BlockBasedTableOptions().block_size /*estimated_value_size*/)
+              .MakeSharedCache()}) {
+      ++iterations_tested;
+
+      Options options = CurrentOptions();
+      SetTimeElapseOnlySleepOnReopen(&options);
+      options.create_if_missing = true;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.max_open_files = 13;
+      options.table_cache_numshardbits = 0;
+      // If this wakes up, it could interfere with test
+      options.stats_dump_period_sec = 0;
+
+      BlockBasedTableOptions table_options;
+      table_options.block_cache = cache;
+      table_options.cache_index_and_filter_blocks = true;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+      if (partition) {
+        table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+        table_options.partition_filters = true;
+      }
+      table_options.metadata_cache_options.top_level_index_pinning =
+          PinningTier::kNone;
+      table_options.metadata_cache_options.partition_pinning =
+          PinningTier::kNone;
+      table_options.metadata_cache_options.unpartitioned_pinning =
+          PinningTier::kNone;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      DestroyAndReopen(options);
+
+      // Create a new table.
+      ASSERT_OK(Put("foo", "value"));
+      ASSERT_OK(Put("bar", "value"));
+      ASSERT_OK(Flush());
+
+      ASSERT_OK(Put("zfoo", "value"));
+      ASSERT_OK(Put("zbar", "value"));
+      ASSERT_OK(Flush());
+
+      ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+      // Fresh cache
+      ClearCache(cache.get());
+
+      std::array<size_t, kNumCacheEntryRoles> expected{};
+      // For CacheEntryStatsCollector
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
+
+      // First access only filters
+      ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
+      expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
+      if (partition) {
+        expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
+      }
+      // Within some time window, we will get cached entry stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Not enough to force a miss
+      env_->MockSleepForSeconds(45);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      // Now access index and data block
+      ASSERT_EQ("value", Get("foo"));
+      expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      if (partition) {
+        // top-level
+        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      }
+      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      // Enough to force a miss
+      env_->MockSleepForSeconds(601);
+      // But inject a simulated long scan so that we need a longer
+      // interval to force a miss next time.
+      SyncPoint::GetInstance()->SetCallBack(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+          [this](void*) {
+            // To spend no more than 0.2% of time scanning, we would need
+            // an interval of at least 10000s
+            env_->MockSleepForSeconds(20);
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+
+      // The same for the other file
+      ASSERT_EQ("value", Get("zfoo"));
+      expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      if (partition) {
+        // top-level
+        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      }
+      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      // Because of the simulated long scan, this is not enough to force
+      // a miss
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // But this is enough
+      env_->MockSleepForSeconds(10000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;
+
+      // Also check the GetProperty interface
+      std::map<std::string, std::string> values;
+      ASSERT_TRUE(
+          db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+      for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+        auto role = static_cast<CacheEntryRole>(i);
+        EXPECT_EQ(std::to_string(expected[i]),
+                  values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+      }
+
+      // Add one for kWriteBuffer
+      {
+        WriteBufferManager wbm(size_t{1} << 20, cache);
+        wbm.ReserveMem(1024);
+        expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+        // Now we check that the GetProperty interface is more aggressive
+        // about re-scanning stats, but not totally aggressive.
+        // Within some time window, we will get cached entry stats
+        env_->MockSleepForSeconds(1);
+        EXPECT_EQ(std::to_string(prev_expected[static_cast<size_t>(
+                      CacheEntryRole::kWriteBuffer)]),
+                  values[BlockCacheEntryStatsMapKeys::EntryCount(
+                      CacheEntryRole::kWriteBuffer)]);
+        // Not enough for a "background" miss but enough for a "foreground"
+        // miss
+        env_->MockSleepForSeconds(45);
+
+        ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+                                        &values));
+        EXPECT_EQ(
+            std::to_string(
+                expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+            values[BlockCacheEntryStatsMapKeys::EntryCount(
+                CacheEntryRole::kWriteBuffer)]);
+      }
+      prev_expected = expected;
+
+      // With the collector pinned in cache, we should be able to hit
+      // even if the cache is full
+      ClearCache(cache.get());
+      Cache::Handle* h = nullptr;
+      if (strcmp(cache->Name(), "LRUCache") == 0) {
+        ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+                                GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                                &h, Cache::Priority::HIGH));
+      } else {
+        // For ClockCache we use a 16-byte key.
+        ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr, capacity + 1,
+                                GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                                &h, Cache::Priority::HIGH));
+      }
+      ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+      expected = {};
+      // For CacheEntryStatsCollector
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+      // For Fill-it-up
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+      // Still able to hit on saved stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(1000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      cache->Release(h);
+
+      // Now we test that the DB mutex is not held during scans, for the ways
+      // we know how to (possibly) trigger them. Without a better way to
+      // check this, we simply inject an acquire & release of the DB mutex
+      // deep in the stat collection code. If we were already holding the
+      // mutex, that is UB that would at least be found by TSAN.
+      int scan_count = 0;
+      SyncPoint::GetInstance()->SetCallBack(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+          [this, &scan_count](void*) {
+            dbfull()->TEST_LockMutex();
+            dbfull()->TEST_UnlockMutex();
+            ++scan_count;
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      // Different things that might trigger a scan, with mock sleeps to
+      // force a miss.
+      env_->MockSleepForSeconds(10000);
+      dbfull()->DumpStats();
+      ASSERT_EQ(scan_count, 1);
+
+      env_->MockSleepForSeconds(60);
+      ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                      &values));
+      ASSERT_EQ(scan_count, 1);
+      ASSERT_TRUE(
+          db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+      ASSERT_EQ(scan_count, 2);
+
+      env_->MockSleepForSeconds(10000);
+      ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                      &values));
+      ASSERT_EQ(scan_count, 3);
+
+      env_->MockSleepForSeconds(60);
+      std::string value_str;
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                   &value_str));
+      ASSERT_EQ(scan_count, 3);
+      ASSERT_TRUE(
+          db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str));
+      ASSERT_EQ(scan_count, 4);
+
+      env_->MockSleepForSeconds(10000);
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                   &value_str));
+      ASSERT_EQ(scan_count, 5);
+
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str));
+      // To match historical speed, querying this property no longer triggers
+      // a scan, even if results are old.
+      // But periodic dump stats should keep things reasonably updated.
+      ASSERT_EQ(scan_count, /*unchanged*/ 5);
+
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+    }
+    EXPECT_GE(iterations_tested, 1);
+  }
+}
+
+namespace {
+
+void DummyFillCache(Cache& cache, size_t entry_size,
+                    std::vector<CacheHandleGuard<void>>& handles) {
+  // fprintf(stderr, "Entry size: %zu\n", entry_size);
+  handles.clear();
+  cache.EraseUnRefEntries();
+  void* fake_value = &cache;
+  size_t capacity = cache.GetCapacity();
+  OffsetableCacheKey ck{"abc", "abc", 42};
+  for (size_t my_usage = 0; my_usage < capacity;) {
+    size_t charge = std::min(entry_size, capacity - my_usage);
+    Cache::Handle* handle;
+    Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+                             charge, /*deleter*/ nullptr, &handle);
+    ASSERT_OK(st);
+    handles.emplace_back(&cache, handle);
+    my_usage += charge;
+  }
+}
+
+class CountingLogger : public Logger {
+ public:
+  ~CountingLogger() override {}
+  using Logger::Logv;
+  void Logv(const InfoLogLevel log_level, const char* format,
+            va_list /*ap*/) override {
+    if (std::strstr(format, "HyperClockCache") == nullptr) {
+      // Not a match
+      return;
+    }
+    // static StderrLogger debug;
+    // debug.Logv(log_level, format, ap);
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      ++info_count_;
+    } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+      ++warn_count_;
+    } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+      ++error_count_;
+    }
+  }
+
+  std::array<int, 3> PopCounts() {
+    std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+    info_count_ = warn_count_ = error_count_ = 0;
+    return rv;
+  }
+
+ private:
+  int info_count_{};
+  int warn_count_{};
+  int error_count_{};
+};
+
+}  // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+  size_t capacity = 1024 * 1024;
+  size_t value_size_est = 8 * 1024;
+  HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+  hcc_opts.num_shard_bits = 2;  // 4 shards
+  hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+  std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.info_log = logger;
+  // Going to sample more directly
+  options.stats_dump_period_sec = 0;
+  Reopen(options);
+
+  std::vector<CacheHandleGuard<void>> handles;
+
+  // Clear anything from DB startup
+  logger->PopCounts();
+
+  // Fill the cache based on the expected entry size and check that we don't
+  // report anything relevant in the periodic stats dump
+  DummyFillCache(*cache, value_size_est, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Same, within reasonable bounds
+  DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Estimate too high (value size too low) eventually reports WARN, then
+  // ERROR
+  DummyFillCache(*cache, value_size_est / 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+  // Estimate too low (value size too high) starts with INFO
+  // and is only WARNING in the worst case
+  DummyFillCache(*cache, value_size_est * 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 20, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
+#endif  // ROCKSDB_LITE
+
+class DBBlockCacheKeyTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  DBBlockCacheKeyTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+  void SetUp() override {
+    use_compressed_cache_ = std::get<0>(GetParam());
+    exclude_file_numbers_ = std::get<1>(GetParam());
+  }
+
+  bool use_compressed_cache_;
+  bool exclude_file_numbers_;
+};
+
+// Disable LinkFile so that we can physically copy a DB using Checkpoint.
+// Disable file GetUniqueId to enable stable cache keys.
+class StableCacheKeyTestFS : public FaultInjectionTestFS {
+ public:
+  explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
+      : FaultInjectionTestFS(base) {
+    SetFailGetUniqueId(true);
+  }
+
+  virtual ~StableCacheKeyTestFS() override {}
+
+  IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+                    IODebugContext*) override {
+    return IOStatus::NotSupported("Disabled");
+  }
+};
+
+TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
+  std::shared_ptr<StableCacheKeyTestFS> test_fs{
+      new StableCacheKeyTestFS(env_->GetFileSystem())};
+  std::unique_ptr<CompositeEnvWrapper> test_env{
+      new CompositeEnvWrapper(env_, test_fs)};
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.env = test_env.get();
+
+  // Corrupting the table properties corrupts the unique id.
+  // Ignore the unique id recorded in the manifest.
+  options.verify_sst_unique_id_in_manifest = false;
+
+  BlockBasedTableOptions table_options;
+
+  int key_count = 0;
+  uint64_t expected_stat = 0;
+
+  std::function<void()> verify_stats;
+  if (use_compressed_cache_) {
+    if (!Snappy_Supported()) {
+      ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+      return;
+    }
+    options.compression = CompressionType::kSnappyCompression;
+    table_options.no_block_cache = true;
+    table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
+    verify_stats = [&options, &expected_stat] {
+      // One for ordinary SST file and one for external SST file
+      ASSERT_EQ(
+          expected_stat,
+          options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
+    };
+  } else {
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+    verify_stats = [&options, &expected_stat] {
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    };
+  }
+
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"koko"}, options);
+
+  if (exclude_file_numbers_) {
+    // Simulate something like old behavior without file numbers in
+    // properties. This is a "control" side of the test that also ensures
+    // safely degraded behavior on old files.
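+    // (With orig_file_number forced to 0 below, SetupBaseCacheKey presumably
+    // cannot treat the file number as stable, so each re-open generates
+    // fresh cache keys; perform_gets() accounts for this by accumulating
+    // expected_stat instead of holding it fixed.)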
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
+        [&](void* arg) {
+          TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+          props->orig_file_number = 0;
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  std::function<void()> perform_gets = [&key_count, &expected_stat, this]() {
+    if (exclude_file_numbers_) {
+      // No cache key reuse should happen, because we can't rely on current
+      // file number being stable
+      expected_stat += key_count;
+    } else {
+      // Cache keys should be stable
+      expected_stat = key_count;
+    }
+    for (int i = 0; i < key_count; ++i) {
+      ASSERT_EQ(Get(1, Key(i)), "abc");
+    }
+  };
+
+  // Ordinary SST files with same session id
+  const std::string something_compressible(500U, 'x');
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put(1, Key(key_count), "abc"));
+    ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible));
+    ASSERT_OK(Flush(1));
+    ++key_count;
+  }
+
+#ifndef ROCKSDB_LITE
+  // Save an export of those ordinary SST files for later
+  std::string export_files_dir = dbname_ + "/exported";
+  ExportImportFilesMetaData* metadata_ptr_ = nullptr;
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+  checkpoint = nullptr;
+
+  // External SST files with same session id
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  std::vector<std::string> external;
+  for (int i = 0; i < 2; ++i) {
+    std::string f = dbname_ + "/external" + std::to_string(i) + ".sst";
+    external.push_back(f);
+    ASSERT_OK(sst_file_writer.Open(f));
+    ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc"));
+    ASSERT_OK(
+        sst_file_writer.Put(Key(key_count) + "a", something_compressible));
+    ++key_count;
+    ExternalSstFileInfo external_info;
+    ASSERT_OK(sst_file_writer.Finish(&external_info));
+    IngestExternalFileOptions ingest_opts;
+    ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts));
+  }
+
+  if (exclude_file_numbers_) {
+    // FIXME(peterd): figure out where these extra ADDs are coming from
+    options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD,
+                                   uint64_t{0} - uint64_t{2});
+  }
+#endif
+
+  perform_gets();
+  verify_stats();
+
+  // Make sure we can cache hit after re-open
+  ReopenWithColumnFamilies({"default", "koko"}, options);
+
+  perform_gets();
+  verify_stats();
+
+  // Make sure we can cache hit even on a full copy of the DB. Using
+  // StableCacheKeyTestFS, Checkpoint will resort to a full copy, not a hard
+  // link. (Checkpoint is not available in LITE mode to test this.)
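+  // (Why a full copy results: StableCacheKeyTestFS::LinkFile returns
+  // NotSupported, so CreateCheckpoint cannot hard-link the SST files and
+  // falls back to copying them; the copies keep the same table properties,
+  // hence the same stable cache keys.)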
+#ifndef ROCKSDB_LITE
+  auto db_copy_name = dbname_ + "-copy";
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
+  delete checkpoint;
+
+  Close();
+  Destroy(options);
+
+  // Switch to the DB copy
+  SaveAndRestore<std::string> save_dbname(&dbname_, db_copy_name);
+  ReopenWithColumnFamilies({"default", "koko"}, options);
+
+  perform_gets();
+  verify_stats();
+
+  // And ensure that re-importing + ingesting the same files into a
+  // different DB uses same cache keys
+  DestroyAndReopen(options);
+
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                              ImportColumnFamilyOptions(),
+                                              *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+  delete cfh;
+  cfh = nullptr;
+  delete metadata_ptr_;
+  metadata_ptr_ = nullptr;
+
+  ASSERT_OK(DestroyDB(export_files_dir, options));
+
+  ReopenWithColumnFamilies({"default", "yoyo"}, options);
+
+  IngestExternalFileOptions ingest_opts;
+  ASSERT_OK(db_->IngestExternalFile(handles_[1], external, ingest_opts));
+
+  perform_gets();
+  verify_stats();
+#endif  // !ROCKSDB_LITE
+
+  Close();
+  Destroy(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CacheKeyTest : public testing::Test {
+ public:
+  CacheKey GetBaseCacheKey() {
+    CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0);
+    // Correct for file_number_ == 1
+    *reinterpret_cast<uint64_t*>(&rv) ^= ReverseBits(uint64_t{1});
+    return rv;
+  }
+  CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number,
+                       uint64_t offset) {
+    OffsetableCacheKey offsetable =
+        GetOffsetableCacheKey(session_counter, file_number);
+    // * 4 to counteract optimization that strips lower 2 bits in encoding
+    // the offset in BlockBasedTable::GetCacheKey (which we prefer to include
+    // in unit tests to maximize functional coverage).
+    EXPECT_GE(offset * 4, offset);  // no overflow
+    return BlockBasedTable::GetCacheKey(offsetable,
+                                        BlockHandle(offset * 4, /*size*/ 5));
+  }
+
+ protected:
+  OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter,
+                                           uint64_t file_number) {
+    // Like SemiStructuredUniqueIdGen::GenerateNext
+    tp_.db_session_id = EncodeSessionId(base_session_upper_,
+                                        base_session_lower_ ^ session_counter);
+    tp_.db_id = std::to_string(db_id_);
+    tp_.orig_file_number = file_number;
+    bool is_stable;
+    std::string cur_session_id = "";  // ignored
+    uint64_t cur_file_number = 42;    // ignored
+    OffsetableCacheKey rv;
+    BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
+                                       &rv, &is_stable);
+    EXPECT_TRUE(is_stable);
+    EXPECT_TRUE(!rv.IsEmpty());
+    // BEGIN some assertions in relation to SST unique IDs
+    std::string external_unique_id_str;
+    EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str));
+    UniqueId64x2 sst_unique_id = {};
+    EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id));
+    ExternalUniqueIdToInternal(&sst_unique_id);
+    OffsetableCacheKey ock =
+        OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id);
+    EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice());
+    EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id);
+    // END some assertions in relation to SST unique IDs
+    return rv;
+  }
+
+  TableProperties tp_;
+  uint64_t base_session_upper_ = 0;
+  uint64_t base_session_lower_ = 0;
+  uint64_t db_id_ = 0;
+};
+
+TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
+  // We have to generate our own session IDs for simulation purposes in other
+  // tests.
+  // Here we verify that the DBImpl implementation seems to match our
+  // construction here, by using the lowest XORed-in bits for the "session
+  // counter."
+  std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  uint64_t upper1, upper2, lower1, lower2;
+  ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
+  ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
+  // Because generated in same process
+  ASSERT_EQ(upper1, upper2);
+  // Unless we generate > 4 billion session IDs in this process...
+  ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
+  // But they must be different somewhere
+  ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
+}
+
+namespace {
+// Deconstruct cache key, based on knowledge of implementation details.
+void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64,
+                                 uint64_t* offset_etc64) {
+  *file_num_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data());
+  *offset_etc64 =
+      *reinterpret_cast<const uint64_t*>(key.AsSlice().data() + 8);
+  assert(*file_num_etc64 != 0);
+  if (*offset_etc64 == 0) {
+    std::swap(*file_num_etc64, *offset_etc64);
+  }
+  assert(*offset_etc64 != 0);
+}
+
+// Make a bit mask of 0 to 64 bits
+uint64_t MakeMask64(int bits) {
+  if (bits >= 64) {
+    return uint64_t{0} - 1;
+  } else {
+    return (uint64_t{1} << bits) - 1;
+  }
+}
+
+// See CacheKeyTest::Encodings
+struct CacheKeyDecoder {
+  // Inputs
+  uint64_t base_file_num_etc64, base_offset_etc64;
+  int session_counter_bits, file_number_bits, offset_bits;
+
+  // Derived
+  uint64_t session_counter_mask, file_number_mask, offset_mask;
+
+  // Outputs
+  uint64_t decoded_session_counter, decoded_file_num, decoded_offset;
+
+  void SetBaseCacheKey(const CacheKey& base) {
+    DeconstructNonemptyCacheKey(base, &base_file_num_etc64,
+                                &base_offset_etc64);
+  }
+
+  void SetRanges(int _session_counter_bits, int _file_number_bits,
+                 int _offset_bits) {
+    session_counter_bits = _session_counter_bits;
+    session_counter_mask = MakeMask64(session_counter_bits);
+    file_number_bits = _file_number_bits;
+    file_number_mask = MakeMask64(file_number_bits);
+    offset_bits = _offset_bits;
+    offset_mask = MakeMask64(offset_bits);
+  }
+
+  void Decode(const CacheKey& key) {
+    uint64_t file_num_etc64, offset_etc64;
+    DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64);
+
+    // First decode session counter
+    if (offset_bits + session_counter_bits <= 64) {
+      // fully recoverable from offset_etc64
+      decoded_session_counter =
+          ReverseBits((offset_etc64 ^ base_offset_etc64)) &
+          session_counter_mask;
+    } else if (file_number_bits + session_counter_bits <= 64) {
+      // fully recoverable from file_num_etc64
+      decoded_session_counter = DownwardInvolution(
+          (file_num_etc64 ^ base_file_num_etc64) & session_counter_mask);
+    } else {
+      // Need to combine parts from each word.
+      // Piece1 will contain some correct prefix of the bottom bits of
+      // the session counter.
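+      // (Illustration: with offset_bits == 60, only the 64 - 60 == 4 high
+      // bits of offset_etc64 are free of offset interference, so after
+      // ReverseBits below, piece1 holds just the lowest 4 session-counter
+      // bits; the remaining bits must be recovered from file_num_etc64 via
+      // piece2.)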
+      uint64_t piece1 =
+          ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask);
+      int piece1_bits = 64 - offset_bits;
+      // Piece2 will contain involuted bits that we can combine with piece1
+      // to infer the rest of the session counter
+      int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits);
+      ASSERT_LT(piece2_bits, 64);
+      uint64_t piece2_mask = MakeMask64(piece2_bits);
+      uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask;
+
+      // Cancel out the part of piece2 that we can infer from piece1
+      // (DownwardInvolution distributes over xor)
+      piece2 ^= DownwardInvolution(piece1) & piece2_mask;
+
+      // Now we need to solve for the unknown original bits in higher
+      // positions than piece1 provides. We use Gaussian elimination
+      // because we know that a piece2_bits X piece2_bits submatrix of
+      // the matrix underlying DownwardInvolution times the vector of
+      // unknown original bits equals piece2.
+      //
+      // Build an augmented row matrix for that submatrix, built column by
+      // column.
+      std::array<uint64_t, 64> aug_rows{};
+      for (int i = 0; i < piece2_bits; ++i) {  // over columns
+        uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i);
+        ASSERT_NE(col_i & 1U, 0);
+        for (int j = 0; j < piece2_bits; ++j) {  // over rows
+          aug_rows[j] |= (col_i & 1U) << i;
+          col_i >>= 1;
+        }
+      }
+      // Augment with right hand side
+      for (int j = 0; j < piece2_bits; ++j) {  // over rows
+        aug_rows[j] |= (piece2 & 1U) << piece2_bits;
+        piece2 >>= 1;
+      }
+      // Run Gaussian elimination
+      for (int i = 0; i < piece2_bits; ++i) {  // over columns
+        // Find a row that can be used to cancel others
+        uint64_t canceller = 0;
+        // Note: Rows 0 through i-1 contain 1s in columns already eliminated
+        for (int j = i; j < piece2_bits; ++j) {  // over rows
+          if (aug_rows[j] & (uint64_t{1} << i)) {
+            // Swap into appropriate row
+            std::swap(aug_rows[i], aug_rows[j]);
+            // Keep a handy copy for row reductions
+            canceller = aug_rows[i];
+            break;
+          }
+        }
+        ASSERT_NE(canceller, 0);
+        for (int j = 0; j < piece2_bits; ++j) {  // over rows
+          if (i != j && ((aug_rows[j] >> i) & 1) != 0) {
+            // Row reduction
+            aug_rows[j] ^= canceller;
+          }
+        }
+      }
+      // Extract result
+      decoded_session_counter = piece1;
+      for (int j = 0; j < piece2_bits; ++j) {  // over rows
+        ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j);
+        decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits
+                                   << j;
+      }
+    }
+
+    decoded_offset =
+        offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter);
+
+    decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^
+                                   DownwardInvolution(decoded_session_counter));
+  }
+};
+}  // anonymous namespace
+
+TEST_F(CacheKeyTest, Encodings) {
+  // This test primarily verifies this claim from cache_key.cc:
+  // // In fact, if DB ids were not involved, we would be guaranteed unique
+  // // cache keys for files generated in a single process until total bits for
+  // // biggest session_id_counter, orig_file_number, and offset_in_file
+  // // reach 128 bits.
+  //
+  // To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs
+  // to the cache key when provided an output cache key, the unstructured
+  // inputs, and bounds on the structured inputs.
+  //
+  // See OffsetableCacheKey comments in cache_key.cc.
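+  //
+  // (Concrete instance of the bound: session_counter_bits == 40,
+  // file_number_bits == 40, offset_bits == 48 sums to exactly 128 and is
+  // exercised by the loop below, while any combination summing to more than
+  // 128 is skipped by the `> 128` check.)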
+
+  // We are going to randomly initialize some values that *should* not affect
+  // the result
+  Random64 r{std::random_device{}()};
+
+  CacheKeyDecoder decoder;
+  db_id_ = r.Next();
+  base_session_upper_ = r.Next();
+  base_session_lower_ = r.Next();
+  if (base_session_lower_ == 0) {
+    base_session_lower_ = 1;
+  }
+
+  decoder.SetBaseCacheKey(GetBaseCacheKey());
+
+  // Loop over configurations and test those
+  for (int session_counter_bits = 0; session_counter_bits <= 64;
+       ++session_counter_bits) {
+    for (int file_number_bits = 1; file_number_bits <= 64;
+         ++file_number_bits) {
+      // 62 bits max because unoptimized offset will be 64 bits in that case
+      for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) {
+        if (session_counter_bits + file_number_bits + offset_bits > 128) {
+          break;
+        }
+
+        decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits);
+
+        uint64_t session_counter = r.Next() & decoder.session_counter_mask;
+        uint64_t file_number = r.Next() & decoder.file_number_mask;
+        if (file_number == 0) {
+          // Minimum
+          file_number = 1;
+        }
+        uint64_t offset = r.Next() & decoder.offset_mask;
+        decoder.Decode(GetCacheKey(session_counter, file_number, offset));
+
+        EXPECT_EQ(decoder.decoded_session_counter, session_counter);
+        EXPECT_EQ(decoder.decoded_file_num, file_number);
+        EXPECT_EQ(decoder.decoded_offset, offset);
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
+
+class DBBlockCachePinningTest
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool, PinningTier, PinningTier, PinningTier>> {
+ public:
+  DBBlockCachePinningTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+  void SetUp() override {
+    partition_index_and_filters_ = std::get<0>(GetParam());
+    top_level_index_pinning_ = std::get<1>(GetParam());
+    partition_pinning_ = std::get<2>(GetParam());
+    unpartitioned_pinning_ = std::get<3>(GetParam());
+  }
+
+  bool partition_index_and_filters_;
+  PinningTier top_level_index_pinning_;
+  PinningTier partition_pinning_;
+  PinningTier unpartitioned_pinning_;
+};
+
+TEST_P(DBBlockCachePinningTest, TwoLevelDB) {
+  // Creates one file in L0 and one file in L1. Both files have enough data
+  // that their index and filter blocks are partitioned. The L1 file will also
+  // have a compression dictionary (those are trained only during compaction),
+  // which must be unpartitioned.
+  const int kKeySize = 32;
+  const int kBlockSize = 128;
+  const int kNumBlocksPerFile = 128;
+  const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize;
+
+  Options options = CurrentOptions();
+  // `kNoCompression` makes the unit test more portable. But it relies on the
+  // current behavior of persisting/accessing dictionary even when there's no
+  // (de)compression happening, which seems fairly likely to change over time.
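+  // (Configuration sketch for what follows: compression = kNoCompression
+  //  while compression_opts.max_dict_bytes = 4 KiB, i.e. dictionary
+  //  sampling/persistence stays enabled even though nothing is actually
+  //  (de)compressed.)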
+ options.compression = kNoCompression; + options.compression_opts.max_dict_bytes = 4 << 10; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 20 /* capacity */); + table_options.block_size = kBlockSize; + table_options.metadata_block_size = kBlockSize; + table_options.cache_index_and_filter_blocks = true; + table_options.metadata_cache_options.top_level_index_pinning = + top_level_index_pinning_; + table_options.metadata_cache_options.partition_pinning = partition_pinning_; + table_options.metadata_cache_options.unpartitioned_pinning = + unpartitioned_pinning_; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10 /* bits_per_key */)); + if (partition_index_and_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize))); + } + ASSERT_OK(Flush()); + if (i == 0) { + // Prevent trivial move so file will be rewritten with dictionary and + // reopened with L1's pinning settings. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + } + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. + table_options.block_cache->EraseUnRefEntries(); + + // Get base cache values + uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t compression_dict_misses = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + + // Read a key from the L0 file + Get(Key(kNumKeysPerFile)); + uint64_t expected_filter_misses = filter_misses; + uint64_t expected_index_misses = index_misses; + uint64_t expected_compression_dict_misses = compression_dict_misses; + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. 
+ table_options.block_cache->EraseUnRefEntries(); + + // Read a key from the L1 file + Get(Key(0)); + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone || + top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone || + partition_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); +} + +INSTANTIATE_TEST_CASE_P( + DBBlockCachePinningTest, DBBlockCachePinningTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc new file mode 100644 index 000000000..d68ab6115 --- /dev/null +++ b/src/rocksdb/db/db_bloom_filter_test.cc @@ -0,0 +1,3498 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include <cmath>
+#include <cstring>
+#include <deque>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/format.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+std::shared_ptr<const FilterPolicy> Create(double bits_per_key,
+                                           const std::string& name) {
+  return BloomLikeFilterPolicy::Create(name, bits_per_key);
+}
+const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName();
+const std::string kFastLocalBloom =
+    test::FastLocalBloomFilterPolicy::kClassName();
+const std::string kStandard128Ribbon =
+    test::Standard128RibbonFilterPolicy::kClassName();
+const std::string kAutoBloom = BloomFilterPolicy::kClassName();
+const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
+}  // anonymous namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+  DBBloomFilterTest()
+      : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {}
+};
+
+class DBBloomFilterTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<std::string, bool, uint32_t>> {
+  // public testing::WithParamInterface {
+ protected:
+  std::string bfp_impl_;
+  bool partition_filters_;
+  uint32_t format_version_;
+
+ public:
+  DBBloomFilterTestWithParam()
+      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+  ~DBBloomFilterTestWithParam() override {}
+
+  void SetUp() override {
+    bfp_impl_ = std::get<0>(GetParam());
+    partition_filters_ = std::get<1>(GetParam());
+    format_version_ = std::get<2>(GetParam());
+  }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5;
+  }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+  do {
+    ReadOptions ropts;
+    std::string value;
+    anon::OptionsOverride options_override;
+    options_override.filter_policy = Create(20, bfp_impl_);
+    options_override.partition_filters = partition_filters_;
+    options_override.metadata_block_size = 32;
+    options_override.full_block_cache = true;
+    Options options = CurrentOptions(options_override);
+    if (partition_filters_) {
+      auto* table_options =
+          options.table_factory->GetOptions<BlockBasedTableOptions>();
+      if (table_options != nullptr &&
+          table_options->index_type !=
+              BlockBasedTableOptions::kTwoLevelIndexSearch) {
+        // In the current implementation partitioned filters depend on
+        // partitioned indexes
+        continue;
+      }
+    }
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+    ASSERT_OK(Put(1, "a", "b"));
+    bool value_found = false;
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(value_found);
+    ASSERT_EQ("b", value);
+
+    ASSERT_OK(Flush(1));
+    value.clear();
+
+    uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(!value_found);
+    // assert that no new files were opened and no new blocks were
+    // read into block cache.
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "a"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                          true /* disallow trivial move */));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "c"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // The KeyMayExist function only checks data in block caches, which is
+    // not used by the plain table format.
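+    // Minimal KeyMayExist usage sketch (illustrative only; `db` and `cf` are
+    // hypothetical stand-ins for the handles used in the calls above):
+    //   std::string v;
+    //   bool found = false;
+    //   if (!db->KeyMayExist(ReadOptions(), cf, "k", &v, &found)) {
+    //     // "k" is definitely absent (no false negatives)
+    //   } else if (found) {
+    //     // value was retrieved into v without extra I/O
+    //   }  // else: "k" may exist; a full Get() is needed to confirm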
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor =
+        std::make_shared<SliceTransformLimitedDomainGeneric>();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ(
+        2,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ro.total_order_seek = true;
+    // NOTE: total_order_seek no longer affects Get()
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+    ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif  // ROCKSDB_LITE
+
+    // No bloom on extractor changed, after re-open
+    options.prefix_extractor.reset(NewCappedPrefixTransform(10));
+    Reopen(options);
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+    ro.total_order_seek = true;
+    // NOTE: total_order_seek no longer affects Get()
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+    ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif  // ROCKSDB_LITE
+
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    bbto.whole_key_filtering = false;
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by
+    // key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    Reopen(options);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Reopen with whole key filtering enabled and prefix extractor
+    // NULL. Bloom filter should be off for both whole key and
+    // prefix bloom.
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.prefix_extractor.reset();
+    Reopen(options);
+
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    // Write DB with only full key filtering.
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by
+    // key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Reopen with whole key filtering off and the prefix extractor enabled.
+    // Still no bloom filter should be used.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Try to create a DB with mixed files:
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by
+    // key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    options.prefix_extractor.reset();
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    // Try to create a DB with mixed files.
+    ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+    // In this case we need to insert some keys to make sure files are
+    // not filtered out by key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(Flush());
+
+    // Now we have two files:
+    // File 1: An older file with prefix bloom.
+    // File 2: A newer file with whole bloom filter.
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+    // Reopen with the same setting: only whole key is used
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+    // Restart with both filters allowed
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    // File 1 will have it filtered out.
+    // File 2 will not, as prefix `foo` exists in the file.
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+    // Restart with only prefix bloom allowed.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    uint64_t bloom_filter_useful_all_levels = 0;
+    for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+      if (kv.second.bloom_filter_useful > 0) {
+        bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+      }
+    }
+    ASSERT_EQ(12, bloom_filter_useful_all_levels);
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+  do {
+    Options options = CurrentOptions();
+    env_->count_random_reads_ = true;
+    options.env = env_;
+    // ChangeCompactOptions() only changes compaction style, which does not
+    // trigger reset of table_factory
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy = Create(10, bfp_impl_);
+    table_options.partition_filters = partition_filters_;
+    if (partition_filters_) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.format_version = format_version_;
+    if (format_version_ >= 4) {
+      // value delta encoding challenged more with index interval > 1
+      table_options.index_block_restart_interval = 8;
+    }
+    table_options.metadata_block_size = 32;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Populate multiple layers
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    ASSERT_OK(Flush(1));
+
+    // Prevent auto compactions triggered by seeks
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+    // Lookup present keys. Should rarely read from the small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    if (partition_filters_) {
+      // Without block cache, we read one extra partition filter per level per
+      // read, and one extra partition index per read
+      ASSERT_LE(reads, 4 * N + 2 * N / 100);
+    } else {
+      ASSERT_LE(reads, N + 2 * N / 100);
+    }
+
+    // Lookup missing keys. Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    if (partition_filters_) {
+      // With partitioned filters we read one extra filter partition per level
+      // per missed read.
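+      // (Worked form of the bound asserted below: ~2 layers of partition
+      //  filters consulted per missed lookup => up to 2 * N reads, plus ~3%
+      //  slack for occasional index/data block reads => 2 * N + 3 * N / 100.)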
+      ASSERT_LE(reads, 2 * N + 3 * N / 100);
+    } else {
+      ASSERT_LE(reads, 3 * N / 100);
+    }
+
+#ifndef ROCKSDB_LITE
+    // Sanity check some table properties
+    std::map<std::string, std::string> props;
+    ASSERT_TRUE(db_->GetMapProperty(
+        handles_[1], DB::Properties::kAggregatedTableProperties, &props));
+    uint64_t nkeys = N + N / 100;
+    uint64_t filter_size = ParseUint64(props["filter_size"]);
+    EXPECT_LE(filter_size,
+              (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8);
+    if (bfp_impl_ == kAutoRibbon) {
+      // Sometimes using Ribbon filter which is more space-efficient
+      EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8);
+    } else {
+      // Always Bloom
+      EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8);
+    }
+
+    uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]);
+    EXPECT_EQ(num_filter_entries, nkeys);
+#endif  // ROCKSDB_LITE
+
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+    Close();
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+
+class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
+ public:
+  void AddKey(const Slice&) override {}
+  size_t EstimateEntriesAdded() override { return 0U; }
+  Slice Finish(std::unique_ptr<const char[]>* /* buf */) override {
+    // Interpreted as "always true" filter (0 probes over 1 byte of
+    // payload, 5 bytes metadata)
+    return Slice("\0\0\0\0\0\0", 6);
+  }
+  using FilterBitsBuilder::Finish;
+  size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+};
+
+class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy {
+ public:
+  explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override {
+    if (skip_) {
+      return nullptr;
+    } else {
+      return new AlwaysTrueBitsBuilder();
+    }
+  }
+
+ private:
+  bool skip_;
+};
+
+}  // anonymous namespace
+
+TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) {
+  constexpr int maxKey = 10;
+  auto PutFn = [&]() {
+    int i;
+    // Put
+    for (i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(Key(i), Key(i)));
+    }
+    Flush();
+  };
+  auto GetFn = [&]() {
+    int i;
+    // Get OK
+    for (i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(Key(i)));
+    }
+    // Get NotFound
+    for (; i < maxKey * 2; i++) {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  };
+  auto PutAndGetFn = [&]() {
+    PutFn();
+    GetFn();
+  };
+#ifndef ROCKSDB_LITE
+  std::map<std::string, std::string> props;
+  const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties;
+#endif  // ROCKSDB_LITE
+
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.partition_filters = partition_filters_;
+  if (partition_filters_) {
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  }
+  table_options.format_version = format_version_;
+
+  // Test 1: bits per key < 0.5 means skip filters -> no filter
+  // constructed or read.
+  table_options.filter_policy = Create(0.4, bfp_impl_);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  PutAndGetFn();
+
+  // Verify no filter access nor construction
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+  props.clear();
+  ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+  EXPECT_EQ(props["filter_size"], "0");
+#endif  // ROCKSDB_LITE
+
+  // Test 2: use custom API to skip filters -> no filter constructed
+  // or read.
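+  // (The "custom API" referred to here is GetBuilderWithContext() returning
+  //  nullptr, as the AlwaysTrueFilterPolicy constructed with skip == true
+  //  above does.)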
+ table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Control test: using an actual filter with 100% FP rate -> the filter + // is constructed and checked on read. + table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify filter is accessed (and constructed) + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_NE(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Test 3 (options test): Able to read existing filters with longstanding + // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter` + ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(), + "rocksdb.BuiltinBloomFilter", + &table_options.filter_policy)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + GetFn(); + + // Verify filter is accessed + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); + + // But new filters are not generated (configuration details unknown) + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestDefFormatVersion, + ::testing::Values( + std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), + std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestWithParam, + ::testing::Values( + std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), + std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatLatest, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion), + std::make_tuple(kAutoBloom, false, kLatestFormatVersion), + std::make_tuple(kAutoRibbon, false, + kLatestFormatVersion))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(DBBloomFilterTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = 
        ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain a wide key range
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+
+    // Check if they can be found
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    // Check if filter is useful
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+    }
+    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+    ASSERT_GE(
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+        maxKey * 0.98);
+    get_perf_context()->Reset();
+  }
+}
+
+namespace {
+struct CompatibilityConfig {
+  std::shared_ptr<const FilterPolicy> policy;
+  bool partitioned;
+  uint32_t format_version;
+
+  void SetInTableOptions(BlockBasedTableOptions* table_options) {
+    table_options->filter_policy = policy;
+    table_options->partition_filters = partitioned;
+    if (partitioned) {
+      table_options->index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    } else {
+      table_options->index_type =
+          BlockBasedTableOptions::IndexType::kBinarySearch;
+    }
+    table_options->format_version = format_version;
+  }
+};
+// High bits per key -> almost no FPs
+std::shared_ptr<const FilterPolicy> kCompatibilityBloomPolicy{
+    NewBloomFilterPolicy(20)};
+// bloom_before_level=-1 -> always use Ribbon
+std::shared_ptr<const FilterPolicy> kCompatibilityRibbonPolicy{
+    NewRibbonFilterPolicy(20, -1)};
+
+std::vector<CompatibilityConfig> kCompatibilityConfigs = {
+    {kCompatibilityBloomPolicy, false, BlockBasedTableOptions().format_version},
+    {kCompatibilityBloomPolicy, true, BlockBasedTableOptions().format_version},
+    {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U},
+    {kCompatibilityRibbonPolicy, false,
+     BlockBasedTableOptions().format_version},
+    {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version},
+};
+}  // anonymous namespace
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger =
+      static_cast<int>(kCompatibilityConfigs.size()) + 1;
+  options.max_open_files = -1;
+
+  Close();
+
+  // Create one file for each kind of filter. Each file covers a distinct key
+  // range.
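+  // (Key-scheme sketch: file i holds exactly the keys "i_A" and "i_Z", so
+  //  probing "i_Q" below falls inside file i's key range without matching a
+  //  stored key -- a true filter-negative probe.)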
+  for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+    BlockBasedTableOptions table_options;
+    kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+    ASSERT_TRUE(table_options.filter_policy != nullptr);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+
+    std::string prefix = std::to_string(i) + "_";
+    ASSERT_OK(Put(prefix + "A", "val"));
+    ASSERT_OK(Put(prefix + "Z", "val"));
+    ASSERT_OK(Flush());
+  }
+
+  // Test filter is used between each pair of {reader,writer} configurations,
+  // because any built-in FilterPolicy should be able to read filters from any
+  // other built-in FilterPolicy
+  for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+    BlockBasedTableOptions table_options;
+    kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) {
+      std::string prefix = std::to_string(j) + "_";
+      ASSERT_EQ("val", Get(prefix + "A"));  // Filter positive
+      ASSERT_EQ("val", Get(prefix + "Z"));  // Filter positive
+      // Filter negative, with high probability
+      ASSERT_EQ("NOT_FOUND", Get(prefix + "Q"));
+      EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+                2);
+      EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    }
+  }
+}
+
+// To align with the type of hash entry being reserved in implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class ChargeFilterConstructionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<CacheEntryRoleOptions::Decision, std::string, bool,
+                     bool>> {
+ public:
+  ChargeFilterConstructionTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true),
+        num_key_(0),
+        charge_filter_construction_(std::get<0>(GetParam())),
+        policy_(std::get<1>(GetParam())),
+        partition_filters_(std::get<2>(GetParam())),
+        detect_filter_construct_corruption_(std::get<3>(GetParam())) {
+    if (charge_filter_construction_ ==
+            CacheEntryRoleOptions::Decision::kDisabled ||
+        policy_ == kLegacyBloom) {
+      // For these cases, we are only interested in whether filter
+      // construction cache charging happens at all, not in its accuracy.
+      // Therefore we don't need many keys.
+      num_key_ = 5;
+    } else if (partition_filters_) {
+      // For the PartitionFilter case, since we set
+      // table_options.metadata_block_size big enough that each partition
+      // triggers at least 1 dummy entry reservation each for hash entries and
+      // final filter, we need a large number of keys to ensure we have at
+      // least two partitions.
+      num_key_ = 18 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else if (policy_ == kFastLocalBloom) {
+      // For the Bloom Filter + FullFilter case, since we design num_key_ to
+      // make hash entry cache charging a multiple of dummy entries, the
+      // correct behavior of charging the final filter on top of it will
+      // trigger at least another dummy entry insertion. Therefore we can
+      // assert that behavior, and we don't need a large number of keys to
+      // verify that we indeed charge for the final filter in cache, even
+      // though the final filter is a lot smaller than the hash entries.
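+      // (Concrete arithmetic, assuming the 256 KiB dummy entry size in use
+      //  at the time of writing: num_key_ = 1 * 262144 / sizeof(uint64_t)
+      //  = 32768 keys, whose 8-byte hash entries charge exactly one dummy
+      //  entry.)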
+      num_key_ = 1 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else {
+      // For the Ribbon Filter + FullFilter case, we need a large enough
+      // number of keys so that charging the final filter after releasing the
+      // hash entries reservation will trigger at least another dummy entry
+      // (or equivalently, cause another peak in cache charging), as the
+      // banding reservation might not be a multiple of dummy entries.
+      num_key_ = 12 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    }
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+
+    // We set the cache capacity big enough to prevent the cache from filling
+    // up, for convenience in calculation.
+    constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kFilterConstruction,
+         {/*.charged = */ charge_filter_construction_}});
+    table_options.filter_policy = Create(10, policy_);
+    table_options.partition_filters = partition_filters_;
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size big enough so that each
+      // partition triggers at least 1 dummy entry insertion each for hash
+      // entries and final filter.
+      table_options.metadata_block_size = 409000;
+    }
+    table_options.detect_filter_construct_corruption =
+        detect_filter_construct_corruption_;
+
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    cache_ = std::make_shared<
+        TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
+        (NewLRUCache(lo)));
+    table_options.block_cache = cache_;
+
+    return table_options;
+  }
+
+  std::size_t GetNumKey() { return num_key_; }
+
+  CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
+    return charge_filter_construction_;
+  }
+
+  std::string GetFilterPolicy() { return policy_; }
+
+  bool PartitionFilters() { return partition_filters_; }
+
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+  GetCache() {
+    return cache_;
+  }
+
+ private:
+  std::size_t num_key_;
+  CacheEntryRoleOptions::Decision charge_filter_construction_;
+  std::string policy_;
+  bool partition_filters_;
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+      cache_;
+  bool detect_filter_construct_corruption_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    ChargeFilterConstructionTestWithParam,
+    ChargeFilterConstructionTestWithParam,
+    ::testing::Values(
+        std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled,
+                        kFastLocalBloom, false, false),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, false, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, false, true),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, true, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, true, true),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, false, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, false, true),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, true, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, true, true),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kLegacyBloom, false, false)));
+
+// TODO: Speed up this test, and reduce disk space usage (~700MB)
+// The current test inserts many keys (on the scale of the dummy entry size)
+// in order to make the small memory users (e.g., the final filter, the
+// partitioned hash entries/filter/banding), which are proportional to the
+// number of keys, big enough that their cache charging triggers dummy entry
+// insertion and becomes observable in the test.
+//
+// However, inserting that many keys slows down this test and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1. Use sync point during cache charging of filter construction
+//
+// Benefit: It does not rely on triggering dummy entry insertion
+// but on the sync point to verify small memory users are charged correctly.
+//
+// Challenge: this approach is intrusive.
+//
+// 2. Make dummy entry size configurable and set it small in the test
+//
+// Benefit: It increases the precision of cache charging and therefore
+// small memory usage can still trigger insertion of dummy entry.
+//
+// Challenge: change CacheReservationManager related APIs and a hack
+// might be needed to control the size of the dummy entry of
+// CacheReservationManager used in filter construction for testing
+// since CacheReservationManager is not exposed at the high level.
+//
+TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
+  Options options = CurrentOptions();
+  // We set write_buffer_size big enough so that in the case where there is
+  // filter construction cache charging, flush won't be triggered before we
+  // manually trigger it for clean testing
+  options.write_buffer_size = 640 << 20;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+      cache = GetCache();
+  options.create_if_missing = true;
+  // Disable auto compaction to prevent its unexpected side effects on the
+  // number of keys per partition that this test carefully designs
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  int num_key = static_cast<int>(GetNumKey());
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
+      << "Flush was triggered too early in the test case with filter "
+         "construction cache charging - please make sure no flush triggered "
+         "during the key insertions above";
+
+  ASSERT_OK(Flush());
+
+  bool charge_filter_construction = (ChargeFilterConstructMemory() ==
+                                     CacheEntryRoleOptions::Decision::kEnabled);
+  std::string policy = GetFilterPolicy();
+  bool partition_filters = PartitionFilters();
+  bool detect_filter_construct_corruption =
+      table_options.detect_filter_construct_corruption;
+
+  std::deque<std::size_t> filter_construction_cache_res_peaks =
+      cache->GetChargedCachePeaks();
+  std::size_t filter_construction_cache_res_increments_sum =
+      cache->GetChargedCacheIncrementSum();
+
+  if (!charge_filter_construction) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+    return;
+  }
+
+  if (policy == kLegacyBloom) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
+        << "There shouldn't be filter construction cache charging as this "
+           "feature does not support kLegacyBloom";
+    return;
+  }
+
+  const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
+      CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+
+  const std::size_t predicted_hash_entries_cache_res =
+      num_key * sizeof(FilterConstructionReserveMemoryHash);
+  ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+      << "It's by this test's design that predicted_hash_entries_cache_res is "
+         "a multiple of the dummy entry size";
+
+  const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+      predicted_hash_entries_cache_res / kDummyEntrySize;
+  const std::size_t predicted_final_filter_cache_res =
+      static_cast<std::size_t>(
+          std::ceil(1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
+                    (policy == kStandard128Ribbon ? 0.7 : 1))) *
+      kDummyEntrySize;
+  const std::size_t predicted_banding_cache_res =
+      static_cast<std::size_t>(
+          std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
+      kDummyEntrySize;
+
+  if (policy == kFastLocalBloom) {
+    /* kFastLocalBloom + FullFilter
+     *        p0
+     *       /  \
+     *    b /    \
+     *     /      \
+     *    /        \
+     *  0/          \
+     * hash entries = b - 0, final filter = p0 - b
+     * p0 = hash entries + final filter
+     *
+     * The test is designed in a way such that the reservation for b is a
+     * multiple of dummy entries so that reservation for (p0 - b)
+     * will trigger at least another dummy entry insertion.
+     *
+     * kFastLocalBloom + FullFilter +
+     * detect_filter_construct_corruption
+     * The peak p0 stays the same as
+     * (kFastLocalBloom + FullFilter) but just lasts
+     * longer since we release hash entries reservation later.
+     *
+     * kFastLocalBloom + PartitionedFilter
+     *                   p1
+     *                  /  \
+     *        p0     b'/    \
+     *       /  \     /      \
+     *    b /    \   /        \
+     *     /      \ /          \
+     *    /        a            \
+     *  0/                       \
+     * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+     * partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
+     *
+     * (increment p0 - 0) + (increment p1 - a)
+     * = partitioned hash entries1 + partitioned hash entries2
+     * + partitioned final filter1 + partitioned final filter2
+     * = hash entries + final filter
+     *
+     * kFastLocalBloom + PartitionedFilter +
+     * detect_filter_construct_corruption
+     * The peaks p0, p1 stay the same as
+     * (kFastLocalBloom + PartitionedFilter) but just
+     * last longer since we release hash entries reservation later.
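+     *
+     * (Worked example with illustrative numbers: if the hash entries charge
+     * 6 dummy entries, the predicted final filter charge for Bloom is
+     * ceil(6 / 6 * 1) = 1 dummy entry, so p0 corresponds to roughly 7 dummy
+     * entries -- i.e. predicted_hash_entries_cache_res +
+     * predicted_final_filter_cache_res computed above.)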
+     *
+     */
+    if (!partition_filters) {
+      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+          << "Filter construction cache charging should have only 1 peak in "
+             "case: kFastLocalBloom + FullFilter";
+      std::size_t filter_construction_cache_res_peak =
+          filter_construction_cache_res_peaks[0];
+      EXPECT_GT(filter_construction_cache_res_peak,
+                predicted_hash_entries_cache_res)
+          << "The testing number of hash entries is designed to make hash "
+             "entries cache charging be multiples of dummy entries"
+             " so the correct behavior of charging final filter on top of it"
+             " should've triggered at least another dummy entry insertion";
+
+      std::size_t predicted_filter_construction_cache_res_peak =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 0.9);
+      EXPECT_LE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 1.1);
+      return;
+    } else {
+      EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+          << "Filter construction cache charging should have multiple peaks "
+             "in case: kFastLocalBloom + "
+             "PartitionedFilter";
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+
+  if (policy == kStandard128Ribbon) {
+    /* kStandard128Ribbon + FullFilter
+     *        p0
+     *       /  \  p1
+     *      /    \/\
+     *   b /     b' \
+     *    /          \
+     *  0/            \
+     * hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
+     * p0 = hash entries + banding
+     *
+     * The test is designed in a way such that the reservation for (p1 - b')
+     * will trigger at least another dummy entry insertion
+     * (or equivalently, creating another peak).
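+     *
+     * (Worked example with illustrative numbers: if the hash entries charge
+     * 12 dummy entries, the banding charges ceil(12 * 2.5) = 30 and the
+     * final filter ceil(12 / 6 * 0.7) = 2 dummy entries, so p0 corresponds
+     * to roughly 42 dummy entries and the later final-filter charge produces
+     * the second, smaller peak p1.)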
+     *
+     * kStandard128Ribbon + FullFilter +
+     * detect_filter_construct_corruption
+     *
+     *          new p0
+     *         /      \
+     *        /        \
+     *  pre p0          \
+     *       /           \
+     *      /             \
+     *   b /               \
+     *    /                 \
+     *  0/                   \
+     * hash entries = b - 0, banding = pre p0 - b,
+     * final filter = new p0 - pre p0
+     * new p0 = hash entries + banding + final filter
+     *
+     * The previous p0 will no longer be a peak since under
+     * detect_filter_construct_corruption == true, we do not release hash
+     * entries reservation (like p0 - b' previously) until after final filter
+     * creation and post-verification
+     *
+     * kStandard128Ribbon + PartitionedFilter
+     *                          p3
+     *        p0               /\   p4
+     *       /  \  p1         /  \  /\
+     *      /    \/\      b''/    a'  \
+     *   b /     b' \       /          \
+     *    /          \     /            \
+     *  0/            a                  \
+     * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
+     * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
+     * partitioned final filter1 = p1 - b', partitioned final filter2 = p4 - a'
+     *
+     * (increment p0 - 0) + (increment p1 - b')
+     * + (increment p3 - a) + (increment p4 - a')
+     * = partitioned hash entries1 + partitioned hash entries2
+     * + partitioned banding1 + partitioned banding2
+     * + partitioned final filter1 + partitioned final filter2
+     * = hash entries + banding + final filter
+     *
+     * kStandard128Ribbon + PartitionedFilter +
+     * detect_filter_construct_corruption
+     *
+     *                            new p3
+     *                           /      \
+     *                     pre p3        \
+     *          new p0          /         \
+     *         /      \        /           \
+     *   pre p0        \      /             \
+     *        /         \  b'/               \
+     *       /           \  /                 \
+     *    b /             \/                   \
+     *     /              a                     \
+     *   0/                                      \
+     * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+     * partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b'
+     * partitioned final filter1 = new p0 - pre p0,
+     * partitioned final filter2 = new p3 - pre p3
+     *
+     * The previous p0 and p3 will no longer be peaks since under
+     * detect_filter_construct_corruption == true, we do not release hash
+     * entries reservation (like p0 - b', p3 - a' previously) until after
+     * partitioned final filter creation and post-verification
+     *
+     * However, the increments sum stays the same as shown below:
+     * (increment new p0 - 0) + (increment new p3 - a)
+     * = partitioned hash entries1 + partitioned hash entries2
+     * + partitioned banding1 + partitioned banding2
+     * + partitioned final filter1 + partitioned final filter2
+     * = hash entries + banding + final filter
+     *
+     */
+    if (!partition_filters) {
+      ASSERT_GE(
+          std::floor(
+              1.0 * predicted_final_filter_cache_res /
+              CacheReservationManagerImpl<
+                  CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
+          1)
+          << "Final filter cache charging too small for this test - please "
+             "increase the number of keys";
+      if (!detect_filter_construct_corruption) {
+        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
+            << "Filter construction cache charging should have 2 peaks in "
+               "case: kStandard128Ribbon + FullFilter. "
+               "The second peak results from charging the final filter after "
+               "decreasing the hash entry reservation, since the testing "
+               "final filter reservation is designed to be at least 1 dummy "
+               "entry size";
+
+        std::size_t filter_construction_cache_res_peak =
+            filter_construction_cache_res_peaks[0];
+        std::size_t predicted_filter_construction_cache_res_peak =
+            predicted_hash_entries_cache_res + predicted_banding_cache_res;
+        EXPECT_GE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 0.9);
+        EXPECT_LE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 1.1);
+      } else {
+        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+            << "Filter construction cache charging should have 1 peak in "
+               "case: kStandard128Ribbon + FullFilter "
+               "+ detect_filter_construct_corruption. "
+               "The previous second peak now disappears since we don't "
+               "decrease the hash entry reservation "
+               "until after final filter reservation and post-verification";
+
+        std::size_t filter_construction_cache_res_peak =
+            filter_construction_cache_res_peaks[0];
+        std::size_t predicted_filter_construction_cache_res_peak =
+            predicted_hash_entries_cache_res + predicted_banding_cache_res +
+            predicted_final_filter_cache_res;
+        EXPECT_GE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 0.9);
+        EXPECT_LE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 1.1);
+      }
+      return;
+    } else {
+      if (!detect_filter_construct_corruption) {
+        EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
+            << "Filter construction cache charging should have at least 3 "
+               "peaks "
+               "in case: kStandard128Ribbon + "
+               "PartitionedFilter";
+      } else {
+        EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+            << "Filter construction cache charging should have at least 2 "
+               "peaks "
+               "in case: kStandard128Ribbon + "
+               "PartitionedFilter + detect_filter_construct_corruption";
+      }
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_banding_cache_res +
+          predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+}
+
+class DBFilterConstructionCorruptionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool, std::string, bool>> {
+ public:
+  DBFilterConstructionCorruptionTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+    table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
+    table_options.filter_policy = Create(10, std::get<1>(GetParam()));
+    table_options.partition_filters = std::get<2>(GetParam());
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size small enough so we can
+      // trigger filter partitioning with GetNumKey() amount of keys
+      table_options.metadata_block_size = 10;
+    }
+
+    return table_options;
+  }
+
+  // Return an appropriate number of keys for testing,
+  // to generate a long filter (i.e., size >= 8 + kMetadataLen)
+  std::size_t GetNumKey() { return 5000; }
+};
+
+INSTANTIATE_TEST_CASE_P(
DBFilterConstructionCorruptionTestWithParam, + DBFilterConstructionCorruptionTestWithParam, + ::testing::Values(std::make_tuple(false, kFastLocalBloom, false), + std::make_tuple(true, kFastLocalBloom, false), + std::make_tuple(true, kFastLocalBloom, true), + std::make_tuple(true, kStandard128Ribbon, false), + std::make_tuple(true, kStandard128Ribbon, true))); + +TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int num_key = static_cast(GetNumKey()); + Status s; + + // Case 1: No corruption in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + s = Flush(); + EXPECT_TRUE(s.ok()); + + // Case 2: Corruption of hash entries in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE( + s.ToString().find("Filter's hash entries checksum mismatched") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + // Case 3: Corruption of filter content in filter construction + DestroyAndReopen(options); + + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) { + std::pair*, std::size_t>* TEST_arg_pair = + (std::pair*, std::size_t>*)arg; + std::size_t filter_size = TEST_arg_pair->second; + // 5 is the kMetadataLen and + assert(filter_size >= 8 + 5); + std::unique_ptr* filter_content_to_corrupt = + TEST_arg_pair->first; + std::memset(filter_content_to_corrupt->get(), '\0', 8); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(s.ToString().find("Corrupted filter content") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperFilter"); +} + +// RocksDB lite does not support dynamic options +#ifndef ROCKSDB_LITE +TEST_P(DBFilterConstructionCorruptionTestWithParam, + DynamicallyTurnOnAndOffDetectConstructCorruption) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + // We intend to turn on + // table_options.detect_filter_construct_corruption dynamically + // therefore we override this test parmater's value + table_options.detect_filter_construct_corruption = false; + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + + int num_key = 
static_cast(GetNumKey()); + Status s; + + DestroyAndReopen(options); + + // Case 1: !table_options.detect_filter_construct_corruption + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + ASSERT_FALSE(table_options.detect_filter_construct_corruption); + EXPECT_TRUE(s.ok()); + + // Case 2: dynamically turn on + // table_options.detect_filter_construct_corruption + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{detect_filter_construct_corruption=true;}"}})); + + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + auto updated_table_options = + db_->GetOptions().table_factory->GetOptions(); + EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") != + std::string::npos); + + // Case 3: dynamically turn off + // table_options.detect_filter_construct_corruption + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{detect_filter_construct_corruption=false;}"}})); + updated_table_options = + db_->GetOptions().table_factory->GetOptions(); + EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption); +} +#endif // ROCKSDB_LITE + +namespace { +// NOTE: This class is referenced by HISTORY.md as a model for a wrapper +// FilterPolicy selecting among configurations based on context. +class LevelAndStyleCustomFilterPolicy : public FilterPolicy { + public: + explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)), + policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)), + policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {} + + const char* Name() const override { + return "LevelAndStyleCustomFilterPolicy"; + } + + // OK to use built-in policy name because we are deferring to a + // built-in builder. We aren't changing the serialized format. 
+ const char* CompatibilityName() const override { + return policy_fifo_->CompatibilityName(); + } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + if (context.compaction_style == kCompactionStyleFIFO) { + return policy_fifo_->GetBuilderWithContext(context); + } else if (context.level_at_creation == 0) { + return policy_l0_other_->GetBuilderWithContext(context); + } else { + return policy_otherwise_->GetBuilderWithContext(context); + } + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + // OK to defer to any of them; they all can parse built-in filters + // from any settings. + return policy_fifo_->GetFilterBitsReader(contents); + } + + private: + const std::unique_ptr policy_fifo_; + const std::unique_ptr policy_l0_other_; + const std::unique_ptr policy_otherwise_; +}; + +static std::map + table_file_creation_reason_to_string{ + {TableFileCreationReason::kCompaction, "kCompaction"}, + {TableFileCreationReason::kFlush, "kFlush"}, + {TableFileCreationReason::kMisc, "kMisc"}, + {TableFileCreationReason::kRecovery, "kRecovery"}, + }; + +class TestingContextCustomFilterPolicy + : public LevelAndStyleCustomFilterPolicy { + public: + explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) { + } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + test_report_ += "cf="; + test_report_ += context.column_family_name; + test_report_ += ",s="; + test_report_ += + OptionsHelper::compaction_style_to_string[context.compaction_style]; + test_report_ += ",n="; + test_report_ += std::to_string(context.num_levels); + test_report_ += ",l="; + test_report_ += std::to_string(context.level_at_creation); + test_report_ += ",b="; + test_report_ += std::to_string(int{context.is_bottommost}); + test_report_ += ",r="; + test_report_ += table_file_creation_reason_to_string[context.reason]; + test_report_ += "\n"; + + return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); + } + + std::string DumpTestReport() { + std::string rv; + std::swap(rv, test_report_); + return rv; + } + + private: + mutable std::string test_report_; +}; +} // anonymous namespace + +TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + auto policy = std::make_shared(15, 8, 5); + Options options; + for (bool fifo : {true, false}) { + options = CurrentOptions(); + options.max_open_files = fifo ? -1 : options.max_open_files; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.compaction_style = + fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; + + BlockBasedTableOptions table_options; + table_options.filter_policy = policy; + table_options.format_version = 5; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TryReopen(options); + CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey / 2; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? 
"cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + + for (int i = maxKey / 2; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + + // Check that they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + // Since we have two tables / two filters, we might have Bloom checks on + // our queries, but no more than one "useful" per query on a found key. + EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey); + + // Check that we have two filters, each about + // fifo: 0.12% FP rate (15 bits per key) + // level: 2.3% FP rate (8 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); + EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98)); + } + + if (!fifo) { // FIFO doesn't fully support CompactRange + // Full compaction + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + EXPECT_EQ(policy->DumpTestReport(), + "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); + + // Check that we now have one filter, about 9.2% FP rate (5 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 0.90); + EXPECT_LE(useful_count, maxKey * 0.91); + } + } else { +#ifndef ROCKSDB_LITE + // Also try external SST file + { + std::string file_path = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("key", "value")); + ASSERT_OK(sst_file_writer.Finish()); + } + // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 + EXPECT_EQ(policy->DumpTestReport(), + "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +#endif + } + + // Destroy + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; + } +} + +class SliceTransformLimitedDomain : public SliceTransform { + const char* Name() const override { return "SliceTransformLimitedDomain"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 5 && src[0] == 'x'; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? 
+    return dst.size() == 5 && dst[0] == 'x';
+  }
+};
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1111_AAAA", "val1"));
+  ASSERT_OK(Put("x1112_AAAA", "val2"));
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("x1111_AAAA"), "val1");
+  ASSERT_EQ(Get("x1112_AAAA"), "val2");
+  ASSERT_EQ(Get("x1113_AAAA"), "val3");
+  ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to the filter, but RocksDB will try to read it from the
+  // filter
+  ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+  ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> iter_res;
+  auto iter = db_->NewIterator(ReadOptions());
+  // Seek to a key that was not in Domain
+  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+    iter_res.emplace_back(iter->value().ToString());
+  }
+
+  std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+  ASSERT_EQ(iter_res, expected_res);
+  delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+  // Regression test for #2743. The range delete tombstones in memtable
+  // should be added even when Get() skips searching due to its prefix bloom
+  // filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kPrefixLen = 4;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  options.memtable_whole_key_filtering = false;
+  Reopen(options);
+  std::string key1("AAAABBBB");
+  std::string key2("AAAACCCC");  // not in DB
+  std::string key3("AAAADDDD");
+  std::string key4("AAAAEEEE");
+  std::string value1("Value1");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+  // same prefix, bloom filter false positive
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // enable whole key bloom filter
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // verify whole key filtering does not depend on prefix_extractor
+  options.prefix_extractor.reset();
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check Get
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check Get
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+  constexpr size_t kPrefixSize = 8;
+  const std::string kKey = "key";
+  assert(kKey.size() < kPrefixSize);
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+  options.memtable_prefix_bloom_size_ratio = 0.25;
+  Reopen(options);
+  ASSERT_OK(Put(kKey, "v"));
+  ASSERT_EQ("v", Get(kKey));
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+  iter->Seek(kKey);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(kKey, iter->key());
+  iter->SeekForPrev(kKey);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(kKey, iter->key());
+}
+
+class DBBloomFilterTestVaryPrefixAndFormatVer
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, uint32_t>> {
+ protected:
+  bool use_prefix_;
+  uint32_t format_version_;
+
+ public:
+  DBBloomFilterTestVaryPrefixAndFormatVer()
+      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+  ~DBBloomFilterTestVaryPrefixAndFormatVer() override {}
+
+  void SetUp() override {
+    use_prefix_ = std::get<0>(GetParam());
+    format_version_ = std::get<1>(GetParam());
+  }
+
+  static std::string UKey(uint32_t i) { return Key(static_cast<int>(i)); }
+};
+
+TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) {
+  Options options = CurrentOptions();
+  if (use_prefix_) {
+    // Entire key from UKey()
+    options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+  }
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(20));
+  bbto.partition_filters = true;
+  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  bbto.whole_key_filtering = !use_prefix_;
+  if (use_prefix_) {  // (not related to prefix, just alternating between)
+    // Make sure code appropriately deals with metadata block size setting
+    // that is "too small" (smaller than minimum size for filter builder)
+    bbto.metadata_block_size = 63;
+  } else {
+    // Make sure the test will work even on platforms with large minimum
+    // filter size, due to large cache line size.
+    // (Largest cache line size + 10+% overhead.)
+    bbto.metadata_block_size = 290;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+  ReadOptions ropts;
+
+  constexpr uint32_t N = 12000;
+  // Add N/2 evens
+  for (uint32_t i = 0; i < N; i += 2) {
+    ASSERT_OK(Put(UKey(i), UKey(i)));
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(TotalTableFiles(), 1);
+#endif
+
+  constexpr uint32_t Q = 29;
+  // MultiGet In
+  std::array<std::string, Q> keys;
+  std::array<Slice, Q> key_slices;
+  std::array<ColumnFamilyHandle*, Q> column_families;
+  // MultiGet Out
+  std::array<Status, Q> statuses;
+  std::array<PinnableSlice, Q> values;
+
+  TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+
+  // Check that initial clump of keys only loads one partition filter from
+  // block cache.
+  // And that spread out keys load many partition filters.
+  // In both cases, mix present vs. not present keys.
+  for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) {
+    for (uint32_t i = 0; i < Q; ++i) {
+      keys[i] = UKey(i * stride);
+      key_slices[i] = Slice(keys[i]);
+      column_families[i] = db_->DefaultColumnFamily();
+      statuses[i] = Status();
+      values[i] = PinnableSlice();
+    }
+
+    db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+                  /*timestamps=*/nullptr, &statuses[0], true);
+
+    // Confirm correct status results
+    uint32_t number_not_found = 0;
+    for (uint32_t i = 0; i < Q; ++i) {
+      if ((i * stride % 2) == 0) {
+        ASSERT_OK(statuses[i]);
+      } else {
+        ASSERT_TRUE(statuses[i].IsNotFound());
+        ++number_not_found;
+      }
+    }
+
+    // Confirm correct Bloom stats (no FPs)
+    uint64_t filter_useful = TestGetAndResetTickerCount(
+        options,
+        use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+    uint64_t filter_checked =
+        TestGetAndResetTickerCount(options, use_prefix_
+                                                ? BLOOM_FILTER_PREFIX_CHECKED
+                                                : BLOOM_FILTER_FULL_POSITIVE) +
+        (use_prefix_ ? 0 : filter_useful);
+    EXPECT_EQ(filter_useful, number_not_found);
+    EXPECT_EQ(filter_checked, Q);
+    if (!use_prefix_) {
+      EXPECT_EQ(
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+          Q - number_not_found);
+    }
+
+    // Confirm no duplicate loading same filter partition
+    uint64_t filter_accesses =
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+    if (stride == 1) {
+      EXPECT_EQ(filter_accesses, 1);
+    } else {
+      // for large stride
+      EXPECT_GE(filter_accesses, Q / 2 + 1);
+    }
+  }
+
+  // Check that a clump of keys (present and not) works when spanning
+  // two partitions
+  int found_spanning = 0;
+  for (uint32_t start = 0; start < N / 2;) {
+    for (uint32_t i = 0; i < Q; ++i) {
+      keys[i] = UKey(start + i);
+      key_slices[i] = Slice(keys[i]);
+      column_families[i] = db_->DefaultColumnFamily();
+      statuses[i] = Status();
+      values[i] = PinnableSlice();
+    }
+
+    db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+                  /*timestamps=*/nullptr, &statuses[0], true);
+
+    // Confirm correct status results
+    uint32_t number_not_found = 0;
+    for (uint32_t i = 0; i < Q; ++i) {
+      if (((start + i) % 2) == 0) {
+        ASSERT_OK(statuses[i]);
+      } else {
+        ASSERT_TRUE(statuses[i].IsNotFound());
+        ++number_not_found;
+      }
+    }
+
+    // Confirm correct Bloom stats (might see some FPs)
+    uint64_t filter_useful = TestGetAndResetTickerCount(
+        options,
+        use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+    uint64_t filter_checked =
+        TestGetAndResetTickerCount(options, use_prefix_
+                                                ? BLOOM_FILTER_PREFIX_CHECKED
+                                                : BLOOM_FILTER_FULL_POSITIVE) +
+        (use_prefix_ ? 0 : filter_useful);
+    EXPECT_GE(filter_useful, number_not_found - 2);  // possible FP
+    EXPECT_EQ(filter_checked, Q);
+    if (!use_prefix_) {
+      EXPECT_EQ(
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+          Q - number_not_found);
+    }
+
+    // Confirm no duplicate loading of same filter partition
+    uint64_t filter_accesses =
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+    if (filter_accesses == 2) {
+      // Spanned across partitions.
+      ++found_spanning;
+      if (found_spanning >= 2) {
+        break;
+      } else {
+        // Ensure that at least once we have at least one present and
+        // one non-present key on both sides of partition boundary.
+        start += 2;
+      }
+    } else {
+      EXPECT_EQ(filter_accesses, 1);
+      // See explanation at "start += 2"
+      start += Q - 4;
+    }
+  }
+  EXPECT_TRUE(found_spanning >= 2);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer,
+                        DBBloomFilterTestVaryPrefixAndFormatVer,
+                        ::testing::Values(
+                            // (use_prefix, format_version)
+                            std::make_tuple(false, 2),
+                            std::make_tuple(false, 3),
+                            std::make_tuple(false, 4),
+                            std::make_tuple(false, 5), std::make_tuple(true, 2),
+                            std::make_tuple(true, 3), std::make_tuple(true, 4),
+                            std::make_tuple(true, 5)));
+
+#ifndef ROCKSDB_LITE
+namespace {
+static const std::string kPlainTable = "test_PlainTableBloom";
+}  // anonymous namespace
+
+class BloomStatsTestWithParam
+    : public DBBloomFilterTest,
+      public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+  BloomStatsTestWithParam() {
+    bfp_impl_ = std::get<0>(GetParam());
+    partition_filters_ = std::get<1>(GetParam());
+
+    options_.create_if_missing = true;
+    options_.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+    options_.memtable_prefix_bloom_size_ratio =
+        8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+    if (bfp_impl_ == kPlainTable) {
+      assert(!partition_filters_);  // not supported in plain table
+      PlainTableOptions table_options;
+      options_.table_factory.reset(NewPlainTableFactory(table_options));
+    } else {
+      BlockBasedTableOptions table_options;
+      if (partition_filters_) {
+        table_options.partition_filters = partition_filters_;
+        table_options.index_type =
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      }
+      table_options.filter_policy = Create(10, bfp_impl_);
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    }
+    options_.env = env_;
+
+    get_perf_context()->Reset();
+    DestroyAndReopen(options_);
+  }
+
+  ~BloomStatsTestWithParam() override {
+    get_perf_context()->Reset();
+    Destroy(options_);
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  std::string bfp_impl_;
+  bool partition_filters_;
+  Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  ASSERT_OK(Flush());
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  // check SST bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+  ASSERT_OK(Flush());
+
+  iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+  // Check SST bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  uint64_t expected_hits = 2;
+  ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+  ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BloomStatsTestWithParam, BloomStatsTestWithParam,
+    ::testing::Values(std::make_tuple(kLegacyBloom, false),
+                      std::make_tuple(kLegacyBloom, true),
+                      std::make_tuple(kFastLocalBloom, false),
+                      std::make_tuple(kFastLocalBloom, true),
+                      std::make_tuple(kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5,6]   (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files). With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // anonymous namespace + +TEST_F(DBBloomFilterTest, PrefixScan) { + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false + options.allow_concurrent_memtable_write = false; + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while +} + +TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.compression = kNoCompression; + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = true; + 
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  get_perf_context()->Reset();
+  get_perf_context()->EnablePerLevelPerfContext();
+  CreateAndReopenWithCF({"mypikachu"}, options);
+
+  int numkeys = 200000;
+
+  // Generate randomly shuffled keys, so the updates are almost
+  // random.
+  std::vector<int> keys;
+  keys.reserve(numkeys);
+  for (int i = 0; i < numkeys; i += 2) {
+    keys.push_back(i);
+  }
+  RandomShuffle(std::begin(keys), std::end(keys), /*seed*/ 42);
+  int num_inserted = 0;
+  for (int key : keys) {
+    ASSERT_OK(Put(1, Key(key), "val"));
+    if (++num_inserted % 1000 == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+  ASSERT_OK(Put(1, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(numkeys), "val"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  if (NumTableFilesAtLevel(0, 1) == 0) {
+    // No Level 0 file. Create one.
+    ASSERT_OK(Put(1, Key(0), "val"));
+    ASSERT_OK(Put(1, Key(numkeys), "val"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  for (int i = 1; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+  }
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted runs, L0, L5 and L6, with most files in L6
+  // having no bloom filter. Most keys will be checked against bloom filters
+  // twice.
+  ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+  ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+  uint64_t bloom_filter_useful_all_levels = 0;
+  for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+    if (kv.second.bloom_filter_useful > 0) {
+      bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+    }
+  }
+  ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+  ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+  for (int i = 0; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
+  }
+
+  // Part 2 (read path): rewrite last level with blooms, then verify they get
+  // cached only if !optimize_filters_for_hits
+  options.disable_auto_compactions = true;
+  options.num_levels = 9;
+  options.optimize_filters_for_hits = false;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+  MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+  std::string value = Get(1, Key(0));
+  uint64_t prev_cache_filter_hits =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  value = Get(1, Key(0));
+  ASSERT_EQ(prev_cache_filter_hits + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Now that we know the filter blocks exist in the last level files, see if
+  // filter caching is skipped for this optimization
+  options.optimize_filters_for_hits = true;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  value = Get(1, Key(0));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // Check filter block ignored for files preloaded during DB::Open()
+  options.max_open_files = -1;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  uint64_t prev_cache_filter_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  Get(1, Key(0));
+  ASSERT_EQ(prev_cache_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(prev_cache_filter_hits,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Check filter block ignored for file trivially-moved to bottom level
+  bbto.block_cache.reset();
+  options.max_open_files = 100;  // setting > -1 makes it not preload all files
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+  ASSERT_OK(Flush(1));
+
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  compact_options.change_level = true;
+  compact_options.target_level = 7;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsNotSupported());
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  prev_cache_filter_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  value = Get(1, Key(numkeys + 1));
+  ASSERT_EQ(prev_cache_filter_hits,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(prev_cache_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+  // Check filter block not cached for iterator
+  bbto.block_cache.reset();
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+  iter->SeekToFirst();
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  get_perf_context()->Reset();
+}
+
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+  int count = 0;
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    count++;
+  }
+  EXPECT_OK(iter->status());
+  return count;
+}
+
+// Use iterate_upper_bound to hint compatibility of existing bloom filters.
+// The BF is considered compatible if 1) upper bound and seek key transform
+// into the same string, or 2) the transformed seek key is of the same length
+// as the upper bound and the two keys are adjacent according to the
+// comparator.
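+// Editor's note: the helper below is a minimal, illustrative sketch of the
+// compatibility rule described above, added for clarity. It is hypothetical
+// and unused by the tests; the real decision is made inside RocksDB's
+// iterator/filter code. It assumes a bytewise comparator and that seek_key
+// is in the prefix extractor's domain, and rule 2 is simplified: it ignores
+// the 0xFF-byte carry case when computing the adjacent (successor) string.
+[[maybe_unused]] static bool UpperBoundCompatibleWithPrefixBF(
+    const SliceTransform& prefix_extractor, const Slice& seek_key,
+    const Slice& upper_bound) {
+  Slice seek_prefix = prefix_extractor.Transform(seek_key);
+  // Rule 1: upper bound and seek key transform into the same string
+  if (prefix_extractor.InDomain(upper_bound) &&
+      prefix_extractor.Transform(upper_bound) == seek_prefix) {
+    return true;
+  }
+  // Rule 2: same length and bytewise-adjacent, i.e. the upper bound equals
+  // the transformed seek key with its last byte incremented by one
+  if (upper_bound.size() == seek_prefix.size() && seek_prefix.size() > 0) {
+    std::string successor = seek_prefix.ToString();
+    successor.back() = static_cast<char>(successor.back() + 1);
+    return upper_bound == Slice(successor);
+  }
+  return false;
+}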
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+  for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+    Options options;
+    options.create_if_missing = true;
+    options.env = CurrentOptions().env;
+    options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
+    // Enable prefix bloom for SST files
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.filter_policy = Create(10, bfp_impl);
+    table_options.index_shortening = BlockBasedTableOptions::
+        IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("abcdxxx0", "val1"));
+    ASSERT_OK(Put("abcdxxx1", "val2"));
+    ASSERT_OK(Put("abcdxxx2", "val3"));
+    ASSERT_OK(Put("abcdxxx3", "val4"));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    {
+      // prefix_extractor has not changed, BF will always be read
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+    }
+    {
+      Slice upper_bound("abcdzzzz");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.FixedPrefix.5");
+    {
+      // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+      // should check bloom filter since upper bound meets requirement
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [abcdxx01, abcey) is not a valid bound since the upper bound is too
+      // long for the BF in SST (capped:4)
+      Slice upper_bound("abcey");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+      // should skip bloom filter since upper bound is too long
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+      Slice upper_bound("abcdy");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+      // should check bloom filter since upper bound matches transformed seek
+      // key
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+      // same prefix, and 2) the prefixes are not consecutive
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+      // should skip bloom filter since mismatch is found
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+    {
+      // [abc, abd) is not a valid bound since the upper bound is too short
+      // for BF (capped:4)
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    // Same with re-open
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    // Set back to capped:4 and verify BF is always read
+    options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+    }
+    // Same if there's a problem initially loading the prefix transform
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+        [&](void* arg) { *static_cast<bool*>(arg) = true; });
+    SyncPoint::GetInstance()->EnableProcessing();
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+    }
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
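+// Editor's note: a minimal usage sketch (hypothetical, assuming an open
+// DB* db) of the dynamic prefix_extractor reconfiguration exercised below.
+// The option strings "fixed:N" and "capped:N" correspond to
+// NewFixedPrefixTransform(N) and NewCappedPrefixTransform(N), as the
+// assertions in these tests demonstrate:
+//
+//   ASSERT_OK(db->SetOptions({{"prefix_extractor", "capped:3"}}));
+//   ASSERT_EQ(db->GetOptions().prefix_extractor->AsString(),
+//             "rocksdb.CappedPrefix.3");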
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { + for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy = Create(10, bfp_impl); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + { + // BF is cappped:3 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so filter is checked once and found no match. 
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + { + // BF is fixed:2 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + // iter was created after the first setoptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + { + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? 
+ } +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + int iteration = 0; + for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy = Create(10, bfp_impl); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; + iteration++; + } +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behaves as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { + for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy = Create(10, bfp_impl); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foo5", "bar5")); 
+ ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } +} + +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. 
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+    it->SeekForPrev(oss.str());
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(it->Valid());
+  }
+  it.reset();
+}
+
+namespace {
+class BackwardBytewiseComparator : public Comparator {
+ public:
+  const char* Name() const override { return "BackwardBytewiseComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
+    const char* a_end = a.data() + a.size();
+    const char* b_end = b.data() + b.size();
+    for (int i = -1; i >= min_size_neg; --i) {
+      if (a_end[i] != b_end[i]) {
+        if (static_cast<unsigned char>(a_end[i]) <
+            static_cast<unsigned char>(b_end[i])) {
+          return -1;
+        } else {
+          return 1;
+        }
+      }
+    }
+    return static_cast<int>(a.size()) - static_cast<int>(b.size());
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+const BackwardBytewiseComparator kBackwardBytewiseComparator{};
+
+class FixedSuffix4Transform : public SliceTransform {
+  const char* Name() const override { return "FixedSuffixTransform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data() + src.size() - 4, 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 4; }
+};
+
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+  if (sst) {
+    return {
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
+  } else {
+    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+    return {hit + miss, miss};
+  }
+}
+
+std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
+                                               uint64_t useful) {
+  return {checked, useful};
+}
+}  // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * key.starts_with(prefix(key))
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = &kBackwardBytewiseComparator;
+  options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("321aaaa", "val1"));
+  ASSERT_OK(Put("112aaaa", "val2"));
+  ASSERT_OK(Put("009aaaa", "val3"));
+  ASSERT_OK(Put("baa", "val4"));  // out of domain
+  ASSERT_OK(Put("321abaa", "val5"));
+  ASSERT_OK(Put("zzz", "val6"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("999aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999abaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "abaa"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999acaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "acaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("zzzz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "baa"), 3);
+      if (flushed) {  // TODO: fix memtable case
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * Compare(prefix(key), key) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("aaaa123", "val1"));
+  ASSERT_OK(Put("aaaa211", "val2"));
+  ASSERT_OK(Put("aaaa900", "val3"));
+  ASSERT_OK(Put("aab", "val4"));  // out of domain
+  ASSERT_OK(Put("aaba123", "val5"));
+  ASSERT_OK(Put("qqqq123", "val7"));
+  ASSERT_OK(Put("qqqq", "val8"));
+  ASSERT_OK(Put("zzz", "val8"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    } else {
+      // TODO: why needed?
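+      // (Likely because perf context counters are thread-local and persist
+      // across iterations, so stale counts would skew GetBloomStat below.)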
+      get_perf_context()->bloom_memtable_hit_count = 0;
+      get_perf_context()->bloom_memtable_miss_count = 0;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("aaaa000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does work as upper bound
+      Slice ub("aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does not work here as seek key
+      Slice ub("aaaa500");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaba000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaba999"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaca000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaca999"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("aaaz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "zzz"), 5);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    }
+    {
+      // Note: prefix does work here as seek key, but only finds key equal
+      // to prefix (others with same prefix are less)
+      read_options.auto_prefix_mode = false;
+      read_options.iterate_upper_bound = nullptr;
+      read_options.prefix_same_as_start = true;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "qqqq"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+  }
+}
+
+namespace {
+// A weird comparator that in combination with NonIdempotentFixed4Transform
+// breaks an old axiom of prefix filtering.
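+// Keys of length >= 5 (in-domain) sort bytewise and after all shorter keys,
+// while the shorter keys (the extracted prefixes) sort among themselves in
+// reverse bytewise order.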
+class WeirdComparator : public Comparator {
+ public:
+  const char* Name() const override { return "WeirdComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    bool a_in = a.size() >= 5;
+    bool b_in = b.size() >= 5;
+    if (a_in != b_in) {
+      // Order keys after prefixes
+      return a_in - b_in;
+    }
+    if (a_in) {
+      return BytewiseComparator()->Compare(a, b);
+    } else {
+      // Different ordering on the prefixes
+      return ReverseBytewiseComparator()->Compare(a, b);
+    }
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+const WeirdComparator kWeirdComparator{};
+
+// Non-idempotent because the prefix is always 4 bytes, but 4-byte keys are
+// out-of-domain for prefix extraction (only keys >= 5 bytes get prefixes)
+class NonIdempotentFixed4Transform : public SliceTransform {
+  const char* Name() const override { return "NonIdempotentFixed4Transform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 5; }
+};
+}  // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// two of the old obsolete, unnecessary axioms of prefix extraction:
+// * prefix(prefix(key)) == prefix(key)
+// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+// These axioms are not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  for (auto weird_comparator : {false, true}) {
+    if (weird_comparator) {
+      options.comparator = &kWeirdComparator;
+    }
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("aaaa123", "val1"));
+    ASSERT_OK(Put("aaaa211", "val2"));
+    ASSERT_OK(Put("aaaa900", "val3"));
+    ASSERT_OK(Put("aab", "val4"));   // out of domain
+    ASSERT_OK(Put("aaba123", "val5"));
+    ASSERT_OK(Put("qqqq123", "val7"));
+    ASSERT_OK(Put("qqqq", "val8"));  // out of domain
+    ASSERT_OK(Put("zzzz", "val8"));  // out of domain
+
+    for (auto flushed : {false, true}) {
+      SCOPED_TRACE("flushed=" + std::to_string(flushed));
+      if (flushed) {
+        ASSERT_OK(Flush());
+      }
+      ReadOptions read_options;
+      if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+        read_options.auto_prefix_mode = true;
+      } else {
+        // TODO: why needed?
+        get_perf_context()->bloom_memtable_hit_count = 0;
+        get_perf_context()->bloom_memtable_miss_count = 0;
+      }
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      {
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        // Note: the count works with weird_comparator because "aaaa" is
+        // ordered as the last of the prefixes
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("aaba9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaba0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        Slice ub("aaca9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaca0"), 0);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+      }
+      {
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..be863d4f6
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,1036 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
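+//
+// Tests for CompactionFilter / CompactionFilterFactory: filtering during
+// compaction, flush, and recovery, snapshot interaction, and range skips.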
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+  DBTestCompactionFilter()
+      : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+    : public DBTestCompactionFilter,
+      public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+  DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+    option_config_ = GetParam();
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+        option_config_ == kUniversalCompactionMultiLevel) {
+      options.create_if_missing = true;
+    }
+    if (option_config_ == kLevelSubcompactions ||
+        option_config_ == kUniversalSubcompactions) {
+      assert(options.max_subcompactions > 1);
+    }
+    Reopen(options);
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+    CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+    ::testing::Values(DBTestBase::OptionConfig::kDefault,
+                      DBTestBase::OptionConfig::kUniversalCompaction,
+                      DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+                      DBTestBase::OptionConfig::kLevelSubcompactions,
+                      DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in non-full valgrind to save time.
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+                        DBTestCompactionFilterWithCompactParam,
+                        ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class KeepFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    return false;
+  }
+
+  const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    return true;
+  }
+
+  bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+                          const Slice& /*operand*/) const override {
+    return true;
+  }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+class DeleteISFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    int i = std::stoi(key.ToString());
+    if (i > 5 && i <= 105) {
+      return true;
+    }
+    return false;
+  }
+
+  bool IgnoreSnapshots() const override { return true; }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
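+// For example, key 0000000023 has floor(23/10) = 2 (even), so it is removed
+// and the compaction skips ahead until key 0000000030.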
+class SkipEvenFilter : public CompactionFilter {
+ public:
+  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* skip_until) const override {
+    cfilter_count++;
+    int i = std::stoi(key.ToString());
+    if (i / 10 % 2 == 0) {
+      char key_str[100];
+      snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+      *skip_until = key_str;
+      ++cfilter_skips;
+      return Decision::kRemoveAndSkipUntil;
+    }
+    return Decision::kKeep;
+  }
+
+  bool IgnoreSnapshots() const override { return true; }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+  explicit ConditionalFilter(const std::string* filtered_value)
+      : filtered_value_(filtered_value) {}
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return value.ToString() == *filtered_value_;
+  }
+
+  const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+  const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
+
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* new_value, bool* value_changed) const override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;
+  }
+
+  const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false,
+                             bool check_context_cf_id = false)
+      : check_context_(check_context),
+        check_context_cf_id_(check_context_cf_id),
+        compaction_filter_created_(false) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    if (check_context_cf_id_) {
+      EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+    }
+    compaction_filter_created_ = true;
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  bool compaction_filter_created() const { return compaction_filter_created_; }
+
+  const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  bool check_context_cf_id_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+  std::atomic<uint32_t> expect_cf_id_;
+  bool compaction_filter_created_;
+};
+
+// This filter factory is configured with a `TableFileCreationReason`. Only
+// table files created for that reason will undergo filtering. This
+// configurability makes it useful in tests for filtering non-compaction table
+// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery".
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DeleteFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    EXPECT_EQ(reason_, context.reason);
+    if (context.reason == TableFileCreationReason::kCompaction &&
+        !context.is_manual_compaction) {
+      // Table files created by automatic compaction do not undergo filtering.
+      // Presumably some tests rely on this.
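+      // Returning a null filter below means that particular table file is
+      // written without any filtering applied.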
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+    return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+  }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalFilterFactory(const Slice& filtered_value)
+      : filtered_value_(filtered_value.ToString()) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalFilter(&filtered_value_));
+  }
+
+  const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+  std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChangeFilterFactory() {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+  }
+
+  const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  options.num_levels = 3;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction at each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
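+  // (Bottommost-level compaction rewrites the sequence numbers of keys not
+  // needed by any snapshot to 0; the internal-key scan below verifies this.)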
+  int count = 0;
+  int total = 0;
+  Arena arena;
+  {
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+        read_options, &arena, kMaxSequenceNumber, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 0);
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // via the compaction filter
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys,
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+  {
+    // Scan the entire database to ensure that nothing is left
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter->SeekToFirst();
+    count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(count, 0);
+  }
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  count = 0;
+  {
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+        read_options, &arena, kMaxSequenceNumber, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      ASSERT_NE(ikey.sequence, (unsigned)0);
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
+  }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // this will produce an empty file (delete compaction filter)
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_OK(itr->status());
+  // empty db
+  ASSERT_TRUE(!itr->Valid());
+
+  delete itr;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterFlush) {
+  // Tests a `CompactionFilterFactory` that filters when table file is created
+  // by flush.
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kFlush);
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  // Puts and Merges are purged in flush.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("NOT_FOUND", Get("a"));
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+
+  // However, Puts and Merges are preserved by recovery.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  Reopen(options);
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+
+  // Likewise, compaction does not apply filtering.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) {
+  // Tests a `CompactionFilterFactory` that filters when table file is created
+  // by recovery.
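+  // (Recovery-time table files are created when memtables rebuilt from the
+  // WAL are flushed during DB::Open.)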
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kRecovery);
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  // Puts and Merges are purged in recovery.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("a"));
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+
+  // However, Puts and Merges are preserved by flush.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+
+  // Likewise, compaction does not apply filtering.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+}
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+       CompactionFilterWithValueChange) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K+1 keys, these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001st key. The compaction filter is not invoked
+  // on keys that are visible via a snapshot because we
+  // cannot delete them anyway.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // push all files to lower levels
+  ASSERT_OK(Flush(1));
+  if (option_config_ != kUniversalCompactionMultiLevel &&
+      option_config_ != kUniversalSubcompactions) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  } else {
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+  }
+
+  // re-write all data again
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // push all files to lower levels. This should
+  // invoke the compaction filter for all 100000 keys.
+  ASSERT_OK(Flush(1));
+  if (option_config_ != kUniversalCompactionMultiLevel &&
+      option_config_ != kUniversalSubcompactions) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  } else {
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+  }
+
+  // verify that all keys now have the new value that
+  // was set by the compaction process.
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    std::string newvalue = Get(1, key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+  std::string one, two, three, four;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  PutFixed64(&four, 4);
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  options.num_levels = 3;
+  // Filter out keys whose value is 2.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalFilterFactory>(two);
+  DestroyAndReopen(options);
+
+  // In the same compaction, a value-type entry needs to be deleted based on
+  // the compaction filter, and there is a merge-type entry for the same key.
+  // In that case, the compaction filter result is ignored.
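+  // Here "foo" gets Put=2 followed by Merge(+1): the filter targets value 2,
+  // but the Put is needed as the merge base, so Get("foo") still returns 3.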
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", one)); + ASSERT_OK(Flush()); + std::string newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + + // value key can be deleted based on compaction filter, leaving only + // merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + newvalue = Get("bar"); + ASSERT_EQ("NOT_FOUND", newvalue); + ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + newvalue = Get("bar"); + ASSERT_EQ(two, two); + + // Compaction filter never applies to merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "foobar", one)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two)); + ASSERT_OK(Flush()); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + + // In the same compaction, both of value type and merge type keys need to be + // deleted based on compaction filter, and there is a merge type for the key. + // For both keys, compaction filter results are ignored. + ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { + KeepFilterFactory* filter = new KeepFilterFactory(true, true); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_filter_factory.reset(filter); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 8; + Reopen(options); + int num_keys_per_file = 400; + for (int j = 0; j < 3; j++) { + // Write several keys. + const std::string value(10, 'x'); + for (int i = 0; i < num_keys_per_file; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%02d", i, j); + ASSERT_OK(Put(key, value)); + } + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + // Make sure next file is much smaller so automatic compaction will not + // be triggered. + num_keys_per_file /= 2; + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Force a manual compaction + cfilter_count = 0; + filter->expect_manual_compaction_.store(true); + filter->expect_full_compaction_.store(true); + filter->expect_cf_id_.store(0); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(cfilter_count, 700); + ASSERT_EQ(NumSortedRuns(0), 1); + ASSERT_TRUE(filter->compaction_filter_created()); + + // Verify total number of keys is correct after manual compaction. 
+  {
+    int count = 0;
+    int total = 0;
+    Arena arena;
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+        read_options, &arena, kMaxSequenceNumber));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+    ASSERT_EQ(total, 700);
+    ASSERT_EQ(count, 0);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+  KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+  filter->expect_cf_id_.store(1);
+
+  Options options = CurrentOptions();
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      ASSERT_OK(Put(1, key, value));
+    }
+    ASSERT_OK(Flush(1));
+    // Make sure the next file is much smaller so automatic compaction will
+    // not be triggered.
+    num_keys_per_file /= 2;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters apply to all records, regardless of snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+  std::string five = std::to_string(5);
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Put some data.
+  const Snapshot* snapshot = nullptr;
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+
+    if (table == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  assert(snapshot != nullptr);
+
+  cfilter_count = 0;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The filter should be invoked for all 40 records.
+  ASSERT_EQ(40, cfilter_count);
+
+  {
+    // Scan the database as of the snapshot: keys 6-9 were deleted despite
+    // being visible to the snapshot.
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    int count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 6);
+    read_options.snapshot = nullptr;
+    std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+    ASSERT_OK(iter1->status());
+    iter1->SeekToFirst();
+    count = 0;
+    while (iter1->Valid()) {
+      count++;
+      iter1->Next();
+    }
+    // We have deleted 10 keys from 40 using the compaction filter:
+    // keys 6-9 before the snapshot and 100-105 after the snapshot.
+    ASSERT_EQ(count, 30);
+  }
+
+  // Release the snapshot and compact again -> now all records should be
+  // removed.
+ db_->ReleaseSnapshot(snapshot); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTestCompactionFilter, SkipUntil) { + Options options = CurrentOptions(); + options.compaction_filter_factory = std::make_shared(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Write 100K keys, these are written to a few files in L0. + for (int table = 0; table < 4; ++table) { + // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371]. + for (int i = table * 6; i < 39 + table * 11; ++i) { + char key[100]; + snprintf(key, sizeof(key), "%010d", table * 100 + i); + ASSERT_OK(Put(key, std::to_string(table * 1000 + i))); + } + ASSERT_OK(Flush()); + } + + cfilter_skips = 0; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Number of skips in tables: 2, 3, 3, 3. + ASSERT_EQ(11, cfilter_skips); + + for (int table = 0; table < 4; ++table) { + for (int i = table * 6; i < 39 + table * 11; ++i) { + int k = table * 100 + i; + char key[100]; + snprintf(key, sizeof(key), "%010d", table * 100 + i); + auto expected = std::to_string(table * 1000 + i); + std::string val; + Status s = db_->Get(ReadOptions(), key, &val); + if (k / 10 % 2 == 0) { + ASSERT_TRUE(s.IsNotFound()); + } else { + ASSERT_OK(s); + ASSERT_EQ(expected, val); + } + } + } +} + +TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) { + BlockBasedTableOptions table_options; + table_options.whole_key_filtering = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(100, false)); + + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewCappedPrefixTransform(9)); + options.compaction_filter_factory = std::make_shared(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + ASSERT_OK(Put("0000000010", "v10")); + ASSERT_OK(Put("0000000020", "v20")); // skipped + ASSERT_OK(Put("0000000050", "v50")); + ASSERT_OK(Flush()); + + cfilter_skips = 0; + EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + EXPECT_EQ(1, cfilter_skips); + + Status s; + std::string val; + + s = db_->Get(ReadOptions(), "0000000010", &val); + ASSERT_OK(s); + EXPECT_EQ("v10", val); + + s = db_->Get(ReadOptions(), "0000000020", &val); + EXPECT_TRUE(s.IsNotFound()); + + s = db_->Get(ReadOptions(), "0000000050", &val); + ASSERT_OK(s); + EXPECT_EQ("v50", val); +} + +class TestNotSupportedFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return true; + } + + const char* Name() const override { return "NotSupported"; } + bool IgnoreSnapshots() const override { return false; } +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) { + Options options = CurrentOptions(); + options.compaction_filter = new TestNotSupportedFilter(); + DestroyAndReopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); + + // Comapction should fail because IgnoreSnapshots() = false + EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsNotSupported()); + + delete options.compaction_filter; +} + +class TestNotSupportedFilterFactory : public CompactionFilterFactory { + public: + explicit TestNotSupportedFilterFactory(TableFileCreationReason reason) + : 
+      : reason_(reason) {}
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /* context */) override {
+    return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+  }
+
+  const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kFlush);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kRecovery);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, DropKeyWithSingleDelete) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v0"));
+  ASSERT_OK(Put("b", "v0"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = options.num_levels - 1;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+  Close();
+
+  class DeleteFilterV2 : public CompactionFilter {
+   public:
+    Decision FilterV2(int /*level*/, const Slice& key,
+                      ValueType /*value_type*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      std::string* /*skip_until*/) const override {
+      if (key.starts_with("b")) {
+        return Decision::kPurge;
+      }
+      return Decision::kRemove;
+    }
+
+    const char* Name() const override { return "DeleteFilterV2"; }
+  } delete_filter_v2;
+
+  options.compaction_filter = &delete_filter_v2;
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(options);
+
+  ASSERT_OK(Put("b", "v1"));
+  ASSERT_OK(Put("x", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("r", "v1"));
+  ASSERT_OK(Put("z", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  Close();
+
+  options.compaction_filter = nullptr;
+  Reopen(options);
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..ba9c50b9a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,8227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
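+//
+// Tests for compaction: deletion triggers, stats bookkeeping, manual and
+// automatic compaction behavior, and subcompactions.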
+
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class CompactionStatsCollector : public EventListener {
+ public:
+  CompactionStatsCollector()
+      : compaction_completed_(
+            static_cast<int>(CompactionReason::kNumOfReasons)) {
+    for (auto& v : compaction_completed_) {
+      v.store(0);
+    }
+  }
+
+  ~CompactionStatsCollector() override {}
+
+  void OnCompactionCompleted(DB* /* db */,
+                             const CompactionJobInfo& info) override {
+    int k = static_cast<int>(info.compaction_reason);
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    assert(k >= 0 && k < num_of_reasons);
+    compaction_completed_[k]++;
+  }
+
+  void OnExternalFileIngested(
+      DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+    compaction_completed_[k]++;
+  }
+
+  void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kFlush);
+    compaction_completed_[k]++;
+  }
+
+  int NumberOfCompactions(CompactionReason reason) const {
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    int k = static_cast<int>(reason);
+    assert(k >= 0 && k < num_of_reasons);
+    return compaction_completed_.at(k).load();
+  }
+
+ private:
+  std::vector<std::atomic<int>> compaction_completed_;
+};
+
+class DBCompactionTest : public DBTestBase {
+ public:
+  DBCompactionTest()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+  /*
+   * Verifies compaction stats of cfd are valid.
+   *
+   * For each level of cfd, its compaction stats are valid if
+   * 1) sum(stat.counts) == stat.count, and
+   * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+   */
+  void VerifyCompactionStats(ColumnFamilyData& cfd,
+                             const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+    InternalStats* internal_stats_ptr = cfd.internal_stats();
+    ASSERT_NE(internal_stats_ptr, nullptr);
+    const std::vector<InternalStats::CompactionStats>& comp_stats =
+        internal_stats_ptr->TEST_GetCompactionStats();
+    const int num_of_reasons =
+        static_cast<int>(CompactionReason::kNumOfReasons);
+    std::vector<int> counts(num_of_reasons, 0);
+    // Count the number of compactions caused by each CompactionReason across
+    // all levels.
+    for (const auto& stat : comp_stats) {
+      int sum = 0;
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += stat.counts[i];
+        sum += stat.counts[i];
+      }
+      ASSERT_EQ(sum, stat.count);
+    }
+    // Verify InternalStats bookkeeping matches that of
+    // CompactionStatsCollector, assuming that all compactions complete.
+    for (int i = 0; i < num_of_reasons; i++) {
+      ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+                counts[i]);
+    }
+#endif /* NDEBUG */
+  }
+};
+
+class DBCompactionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  DBCompactionTestWithParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    max_subcompactions_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+  bool exclusive_manual_compaction_;
+};
+
+class DBCompactionTestWithBottommostParam
+    : public DBTestBase,
+      public testing::WithParamInterface<BottommostLevelCompaction> {
+ public:
+  DBCompactionTestWithBottommostParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    bottommost_level_compaction_ = GetParam();
+  }
+
+  BottommostLevelCompaction bottommost_level_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+                                 public ::testing::WithParamInterface<bool> {
+ public:
+  DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+// Param = true : target level is non-empty
+// Param = false: level between target level and source level
+//   is not empty.
+class ChangeLevelConflictsWithAuto
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
+};
+
+// Param = true: grab the compaction pressure token (enable
+// parallel compactions)
+// Param = false: do not grab the token (no parallel compactions)
+class RoundRobinSubcompactionsAgainstPressureToken
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  RoundRobinSubcompactionsAgainstPressureToken() {
+    grab_pressure_token_ = GetParam();
+  }
+  bool grab_pressure_token_;
+};
+
+class RoundRobinSubcompactionsAgainstResources
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+  RoundRobinSubcompactionsAgainstResources() {
+    total_low_pri_threads_ = std::get<0>(GetParam());
+    max_compaction_limits_ = std::get<1>(GetParam());
+  }
+  int total_low_pri_threads_;
+  int max_compaction_limits_;
+};
+
+namespace {
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+  void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+  SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /* info */) override {
+    ++num_ssts_creation_started_;
+  }
+
+  int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+  std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions(Options options) {
+  options.compression = kNoCompression;
+  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.num_levels = kCDTNumLevels;
+  options.level0_file_num_compaction_trigger = 1;
+  options.target_file_size_base = options.write_buffer_size * 2;
+  options.target_file_size_multiplier = 2;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * options.target_file_size_multiplier;
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = false;
+  options.compaction_options_universal.max_size_amplification_percent = 100;
+  return options;
+}
+
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+    if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// which have overlapping key ranges with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+    const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
+    int min_level, int max_level, const SstFileMetaData* input_file_meta,
+    std::set<std::string>* overlapping_file_names) {
+  std::set<const SstFileMetaData*> overlapping_files;
+  overlapping_files.insert(input_file_meta);
+  for (int m = min_level; m <= max_level; ++m) {
+    for (auto& file : cf_meta.levels[m].files) {
+      for (auto* included_file : overlapping_files) {
+        if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
+          overlapping_files.insert(&file);
+          overlapping_file_names->insert(file.name);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
+                                        Random* rand, int* level = nullptr) {
+  auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
+  for (auto& level_meta : cf_meta.levels) {
+    if (file_id <= level_meta.files.size()) {
+      if (level != nullptr) {
+        *level = level_meta.level;
+      }
+      auto result = rand->Uniform(file_id);
+      return &(level_meta.files[result]);
+    }
+    file_id -= static_cast<int>(level_meta.files.size());
+  }
+  assert(false);
+  return nullptr;
+}
+}  // anonymous namespace
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled +TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { + for (int tid = 0; tid < 3; ++tid) { + uint64_t db_size[2]; + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + + if (tid == 1) { + // the following only disable stats update in DB::Open() + // and should not affect the result of this test. + options.skip_stats_update_on_db_open = true; + } else if (tid == 2) { + // third pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } + + DestroyAndReopen(options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 1024; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(rnd.RandomString(kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); + + if (options.compaction_style == kCompactionStyleUniversal) { + // Claim: in universal compaction none of the original data will remain + // once compactions settle. + // + // Proof: The compensated size of the file containing the most tombstones + // is enough on its own to trigger size amp compaction. Size amp + // compaction is a full compaction, so all tombstones meet the obsolete + // keys they cover. + ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(DBCompactionTest, SkipStatsUpdateTest) { + // This test verify UpdateAccumulatedStats is not on + // if options.skip_stats_update_on_db_open = true + // The test will need to be updated if the internal behavior changes. 
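+  //
+  // Roughly, when stats updates are enabled, DB::Open() reads table
+  // properties from existing SST files so that compaction scoring can use
+  // compensated sizes right away; skipping the update avoids those extra
+  // reads at open. The SyncPoint callback installed below simply counts how
+  // often the stats-update path is entered across the two reopens.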
+
+  Options options = DeletionTriggerOptions(CurrentOptions());
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(rnd.RandomString(kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+
+  ASSERT_OK(Flush());
+
+  Close();
+
+  int update_acc_stats_called = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionStorageInfo::UpdateAccumulatedStats",
+      [&](void* /* arg */) { ++update_acc_stats_called; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Reopen the DB with stats-update disabled
+  options.skip_stats_update_on_db_open = true;
+  options.max_open_files = 20;
+  Reopen(options);
+
+  ASSERT_EQ(update_acc_stats_called, 0);
+
+  // Repeat the reopen process, but this time with
+  // stats-update enabled.
+  options.skip_stats_update_on_db_open = false;
+  Reopen(options);
+
+  ASSERT_GT(update_acc_stats_called, 0);
+
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.max_open_files = 20;
+  options.level0_file_num_compaction_trigger = 3;
+  // Avoid many shards with small max_open_files, where as few as
+  // two table insertions could lead to an LRU eviction, depending on
+  // hash values.
+  options.table_cache_numshardbits = 2;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  int num_table_cache_lookup = 0;
+  int num_new_table_reader = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::FindTable:0", [&](void* arg) {
+        assert(arg != nullptr);
+        bool no_io = *(reinterpret_cast<bool*>(arg));
+        if (!no_io) {
+          // filter out cases for table properties queries.
+          num_table_cache_lookup++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::GetTableReader:0",
+      [&](void* /*arg*/) { num_new_table_reader++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    ASSERT_OK(Put(Key(10 - k), "bar"));
+    if (k < options.level0_file_num_compaction_trigger - 1) {
+      num_table_cache_lookup = 0;
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // Preloading the iterator issues one table cache lookup and creates
+      // a new table reader, if not preloaded.
+      int old_num_table_cache_lookup = num_table_cache_lookup;
+      ASSERT_GE(num_table_cache_lookup, 1);
+      ASSERT_EQ(num_new_table_reader, 1);
+
+      num_table_cache_lookup = 0;
+      num_new_table_reader = 0;
+      ASSERT_EQ(Key(k), Get(Key(k)));
+      // lookup iterator from table cache; no need to create a new one.
+      ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+      ASSERT_EQ(num_new_table_reader, 0);
+    }
+  }
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Preloading the iterator issues one table cache lookup and creates
+  // a new table reader. One file is created for flush and one for
+  // compaction. Compaction inputs make no table cache lookups for
+  // data/range deletion iterators.
+  // May preload table cache too.
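+  // (Loosely: one lookup for verifying the flush output plus one for the
+  // compaction output; ASSERT_GE rather than ASSERT_EQ because table cache
+  // preloading can add further lookups.)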
+  ASSERT_GE(num_table_cache_lookup, 2);
+  int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+  // Create new iterator for:
+  // (1) 1 for verifying flush results
+  // (2) 1 for verifying compaction results.
+  // (3) New TableReaders will not be created for compaction inputs
+  ASSERT_EQ(num_new_table_reader, 2);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Only verifying compaction outputs issues one table cache lookup
+  // (for both the data block and the range deletion block).
+  // May preload table cache too.
+  ASSERT_GE(num_table_cache_lookup, 1);
+  old_num_table_cache_lookup2 = num_table_cache_lookup;
+  // One for verifying compaction results.
+  // No new iterator created for compaction.
+  ASSERT_EQ(num_new_table_reader, 1);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+  for (int tid = 0; tid < 2; ++tid) {
+    uint64_t db_size[3];
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // second pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+    Close();
+    // as auto-compaction is off, we shouldn't see any reduction in db size.
+    ASSERT_LE(db_size[0], db_size[1]);
+
+    // round 3 --- reopen db with auto-compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    // insert a relatively small amount of data to trigger auto compaction.
+    for (int k = 0; k < kTestSize / 10; ++k) {
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+    // this time we're expecting a significant drop in size.
+    //
+    // See the "CompactionDeletionTrigger" test for proof that at most
+    // `db_size[0] / 2` of the original data remains. In addition to that,
+    // this test inserts `db_size[0] / 10` to push the tombstones into SST
+    // files and then through automatic compactions. So in total
+    // `3 * db_size[0] / 5` of the original data may remain.
+    ASSERT_GT(3 * db_size[0] / 5, db_size[2]);
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeBottomPri) {
+  ASSERT_OK(Put(Key(50), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(100), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(200), ""));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,3", FilesPerLevel(0));
+
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Put(Key(199), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(2), ""));
+  ASSERT_OK(Put(Key(199), ""));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("2,0,3", FilesPerLevel(0));
+
+  // Now we have 2 L0 files and 3 L2 files, and a manual compaction will
+  // be triggered.
+  // Two compaction jobs will run. One compacts the 2 L0 files in the low
+  // pri pool and one compacts to L2 in the bottom pri pool.
+  int low_pri_count = 0;
+  int bottom_pri_count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) {
+        Env::Priority* pri = reinterpret_cast<Env::Priority*>(arg);
+        // First time is the low pri pool in this test case.
+        if (low_pri_count == 0 && bottom_pri_count == 0) {
+          ASSERT_EQ(Env::Priority::LOW, *pri);
+        }
+        if (*pri == Env::Priority::LOW) {
+          low_pri_count++;
+        } else {
+          bottom_pri_count++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(1, low_pri_count);
+  ASSERT_EQ(1, bottom_pri_count);
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // Recompacting the bottommost level uses the bottom pool
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(1, low_pri_count);
+  ASSERT_EQ(2, bottom_pri_count);
+
+  env_->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  // The low pri pool is used if the bottom pool has size 0.
+  ASSERT_EQ(2, low_pri_count);
+  ASSERT_EQ(2, bottom_pri_count);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // L1 and L2 can fit deletions iff size compensation does not take
+    // effect, i.e., when `skip_stats_update_on_db_open == true`. Move any
+    // remaining files at or above L2 down to L3 to ensure obsolete data
+    // does not accidentally meet its tombstone above L3. This makes the
+    // final size more deterministic and easy to see whether size
+    // compensation for deletions took effect.
+    MoveFilesToLevel(3 /* level */);
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+    Close();
+    // as auto-compaction is off, we shouldn't see any reduction in db size.
+    ASSERT_LE(db_size[0], db_size[1]);
+
+    // round 3 --- reopen db with auto-compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // deletion entries to take effect.
+      //
+      // The deletions are small enough to fit in L1 and L2, and obsolete
+      // keys were moved to L3+, so none of the original data should have
+      // been dropped.
+      ASSERT_LE(db_size[0], db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      //
+      // See the "CompactionDeletionTrigger" test for proof that at most
+      // `db_size[0] / 2` of the original data remains.
+      ASSERT_GT(db_size[0] / 2, db_size[2]);
+    }
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+  const int kNumKeysPerFile = 100;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_subcompactions = max_subcompactions_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < kNumKeysPerFile; i++) {
+      values.push_back(rnd.RandomString(990));
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+    // put extra key to trigger flush
+    ASSERT_OK(Put(1, "", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // generate one more file in level-0, which should trigger level-0
+  // compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < kNumKeysPerFile; i++) {
+    values.push_back(rnd.RandomString(990));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+  // put extra key to trigger flush
+  ASSERT_OK(Put(1, "", ""));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Make compaction triggers in all of them
+  // and see that the number of compactions scheduled is less than allowed.
+  const int kNumKeysPerFile = 100;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 3;
+  // Should speed up compaction when there are 4 files.
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 20;
+  options.soft_pending_compaction_bytes_limit = 1 << 30;  // Infinitely large
+  options.max_background_compactions = 3;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+  // Block all threads in the thread pool.
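+  // (Each SleepingBackgroundTask pins one LOW-priority pool thread until
+  // WakeUp() is called, so compactions scheduled in the meantime stay
+  // queued and GetThreadPoolQueueLen() below reflects how many compactions
+  // were allowed to be scheduled.)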
+  const size_t kTotalTasks = 4;
+  env_->SetBackgroundThreads(4, Env::LOW);
+  test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_tasks[i], Env::Priority::LOW);
+    sleeping_tasks[i].WaitUntilSleeping();
+  }
+
+  CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+  Random rnd(301);
+  for (int cf = 0; cf < 4; cf++) {
+    for (int num = 0; num < options.level0_file_num_compaction_trigger;
+         num++) {
+      for (int i = 0; i < kNumKeysPerFile; i++) {
+        ASSERT_OK(Put(cf, Key(i), ""));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put(cf, "", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+      ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+    }
+  }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+  ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers the
+  // speed-up condition; three compactions will be scheduled.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    for (int i = 0; i < kNumKeysPerFile; i++) {
+      ASSERT_OK(Put(2, Key(i), ""));
+    }
+    // put extra key to trigger flush
+    ASSERT_OK(Put(2, "", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+              NumTableFilesAtLevel(0, 2));
+  }
+  ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Unblock all threads to unblock all compactions.
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].WakeUp();
+    sleeping_tasks[i].WaitUntilDone();
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify that the number of compactions allowed comes back to 1.
+
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].Reset();
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_tasks[i], Env::Priority::LOW);
+    sleeping_tasks[i].WaitUntilSleeping();
+  }
+  for (int cf = 0; cf < 4; cf++) {
+    for (int num = 0; num < options.level0_file_num_compaction_trigger;
+         num++) {
+      for (int i = 0; i < kNumKeysPerFile; i++) {
+        ASSERT_OK(Put(cf, Key(i), ""));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put(cf, "", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+      ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+    }
+  }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].WakeUp();
+    sleeping_tasks[i].WaitUntilDone();
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(rnd.RandomString(100000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                        true /* disallow trivial move */));
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(1, Key(i)), values[i]);
+  }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 10000;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles(1);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles(1);
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("4", "A"));
+  ASSERT_OK(Put("3", "A"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("2", "A"));
+  ASSERT_OK(Delete("3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put("2", "B"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("4", "A"));
+  ASSERT_OK(Put("3", "A"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("2", "A"));
+  ASSERT_OK(SingleDelete("3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put("2", "B"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitioner) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  std::shared_ptr<SstPartitionerFactory> factory(
+      NewSstPartitionerFixedPrefixFactory(4));
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("aaaa1", "A"));
+  ASSERT_OK(Put("bbbb1", "B"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("aaaa1", "A2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  std::vector<LiveFileMetaData> files;
+  dbfull()->GetLiveFilesMetaData(&files);
+  ASSERT_EQ(2, files.size());
+  ASSERT_EQ("A2", Get("aaaa1"));
+  ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 1;
+  std::shared_ptr<SstPartitionerFactory> factory(
+      NewSstPartitionerFixedPrefixFactory(4));
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("aaaa1", "A"));
+  ASSERT_OK(Put("bbbb1", "B"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  std::vector<LiveFileMetaData> files;
+  dbfull()->GetLiveFilesMetaData(&files);
+  ASSERT_EQ(2, files.size());
+  ASSERT_EQ("A", Get("aaaa1"));
+  ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  // compaction options
+  CompactionOptions compact_opt;
+  compact_opt.compression = kNoCompression;
+  compact_opt.output_file_size_limit = 4096;
+  const size_t key_len =
+      static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
+
+  DestroyAndReopen(options);
+
+  std::vector<const Snapshot*> snaps;
+
+  // create first file and flush to l0
+  for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+    ASSERT_OK(Put(key, std::string(key_len, 'A')));
+    snaps.push_back(dbfull()->GetSnapshot());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // create second file and flush to l0
+  for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+    ASSERT_OK(Put(key, std::string(key_len, 'A')));
+    snaps.push_back(dbfull()->GetSnapshot());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // move both files down to l1
+  ASSERT_OK(
+      dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1));
+
+  // release snapshots so that the first instance of key(3) can have seqId=0
+  for (auto snap : snaps) {
+    dbfull()->ReleaseSnapshot(snap);
+  }
+
+  // create 3 files in l0 to trigger compaction
+  for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+    ASSERT_OK(Put("2", std::string(1, 'A')));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+  // github issue #2249
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  DestroyAndReopen(options);
+
+  // create two files in l1 that we can compact
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+      ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
+      ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_EQ(2, cf_meta.levels[1].files.size());
+  std::vector<std::string> input_filenames;
+  for (const auto& sst_file : cf_meta.levels[1].files) {
+    input_filenames.push_back(sst_file.name);
+  }
+
+  // note that CompactionOptions::output_file_size_limit is unset.
+  CompactionOptions compact_opt;
+  compact_opt.compression = kNoCompression;
+  ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1));
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
+    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
+    ASSERT_OK(Put(1, "bar", "v2"));  // Goes to new log file
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+  } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+  int32_t trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  int32_t num_keys = 80;
+  int32_t value_size = 100 * 1024;  // 100 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  for (int i = 0; i < num_keys; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to L0
+  Reopen(options);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1);  // 1 file in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // 0 files in L1
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  LiveFileMetaData level0_file = metadata[0];  // L0 file meta
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+  // Compaction will initiate a trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // File moved from L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);  // 0 files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);  // 1 file in L1
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+  ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  // non overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {100, 199}, {300, 399}, {0, 99},    {200, 299},
+      {600, 699}, {400, 499}, {500, 550}, {551, 599},
+  };
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());    // Multiple files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+  // Since data is non-overlapping we expect compaction to initiate
+  // a trivial move
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // We expect that all the files were trivially moved from L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  trivial_move = 0;
+  non_trivial_move = 0;
+  values.clear();
+  DestroyAndReopen(options);
+  // Same ranges as above but overlapping
+  ranges = {
+      {100, 199},
+      {300, 399},
+      {0, 99},
+      {200, 299},
+      {600, 699},
+      {400, 499},
+      {500, 560},  // this range overlaps with the next one
+      {551, 599},
+  };
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+  ASSERT_EQ(trivial_move, 0);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.num_levels = 7;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 300]
+  for (int32_t i = 0; i <= 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [600 => 700]
+  for (int32_t i = 600; i <= 700; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L6
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int32_t i = 0; i <= 300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+  for (int32_t i = 600; i <= 700; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) {
+  class SubCompactionEventListener : public EventListener {
+   public:
+    void OnSubcompactionCompleted(const SubcompactionJobInfo&) override {
+      sub_compaction_finished_++;
+    }
+    std::atomic<int> sub_compaction_finished_{0};
+  };
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  SubCompactionEventListener* listener = new SubCompactionEventListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // For a subcompaction to trigger, the output level needs to be non-empty.
+  ASSERT_OK(Put("key", ""));
+  ASSERT_OK(Put("kez", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key", ""));
+  ASSERT_OK(Put("kez", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Ranges that are only briefly overlapping so that they won't be trivially
+  // moved, but subcompaction ranges would only contain a subset of files.
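+  // For instance, {100, 199} and {198, 399} below share only keys 198-199:
+  // enough overlap to rule out a trivial move, while a subcompaction
+  // boundary can still fall between files that barely touch.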
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999},
+  };
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());    // Multiple files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);  // One file in L1
+
+  listener->sub_compaction_finished_ = 0;
+  ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (max_subcompactions_ > 3) {
+    // RocksDB might not generate the exact number of subcompactions.
+    // Here we validate that multiple subcompactions happened.
+    ASSERT_GT(listener->sub_compaction_finished_.load(), 2);
+  }
+
+  // We expect that all the files were compacted to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 0), 1);
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  bool first = true;
+  // Purpose of dependencies:
+  // 4 -> 1: ensure the order of two non-trivial compactions
+  // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial
+  // compactions are installed
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+       {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+       {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (first) {
+          first = false;
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+        } else {  // second non-trivial compaction
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.num_levels = 7;
+  options.max_subcompactions = max_subcompactions_;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 1 << 23;  // 8 MB
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  // Trivially move the two non-overlapping files to level 6
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L6
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 1 file in L0
+  ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+  // 2 files in L6, 1 file in L5
+  ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 6);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    compact_options.change_level = false;
+    compact_options.exclusive_manual_compaction = false;
+    std::string begin_string = Key(0);
+    std::string end_string = Key(199);
+    Slice begin(begin_string);
+    Slice end(end_string);
+    // First non-trivial compaction is triggered
+    ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  });
+
+  TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400)
+  for (int32_t i = 300; i <= 400; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 5 [400 => 500)
+  for (int32_t i = 400; i <= 500; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 6 [500 => 600)
+  for (int32_t i = 500; i <= 600; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  // Second non-trivial compaction is triggered
+  ASSERT_OK(Flush());
+
+  // Before the two non-trivial compactions are installed, there are 3 files
+  // in L0
+  ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+  TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // After the two non-trivial compactions are installed, there is 1 file in
+  // L6 and 1 file in L1
+  ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+  threads.join();
+
+  for (int32_t i = 0; i < 600; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+// Disabled as the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  bool first = true;
+  bool second = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+       {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (first) {
+          TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+          first = false;
+          TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+        } else if (second) {
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+
+  DestroyAndReopen(options);
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L2
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L2, 1 in L0
+  ASSERT_EQ("1,0,2", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+  // 2 files in L2, 1 in L1
+  ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 2);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    compact_options.change_level = false;
+    compact_options.exclusive_manual_compaction = false;
+    std::string begin_string = Key(0);
+    std::string end_string = Key(199);
+    Slice begin(begin_string);
+    Slice end(end_string);
+    ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  });
+
+  TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Many files 4 [300 => 4300)
+  for (int32_t i = 0; i <= 5; i++) {
+    for (int32_t j = 300; j < 4300; j++) {
+      if (j == 2300) {
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+  }
+
+  // Verify level sizes
+  uint64_t target_size = 4 * options.max_bytes_for_level_base;
+  for (int32_t i = 1; i < options.num_levels; i++) {
+    ASSERT_LE(SizeAtLevel(i), target_size);
+    target_size = static_cast<uint64_t>(
+        target_size * options.max_bytes_for_level_multiplier);
+  }
+
+  TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  threads.join();
+
+  for (int32_t i = 0; i < 4300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+        "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+       {"DBImpl::WaitForPendingWrites:BeforeBlock",
+        "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+  Options options = CurrentOptions();
+  options.unordered_write = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("bar", "v1"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  writer.join();
+  ASSERT_EQ(Get("foo"), "v2");
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Reopen(options);
+  ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L2
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // Many files 4 [300 => 4300)
+  for (int32_t i = 0; i <= 5; i++) {
+    for (int32_t j = 300; j < 4300; j++) {
+      if (j == 2300) {
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify level sizes
+  uint64_t target_size = 4 * options.max_bytes_for_level_base;
+  for (int32_t i = 1; i < options.num_levels; i++) {
+    ASSERT_LE(SizeAtLevel(i), target_size);
+    target_size = static_cast<uint64_t>(
+        target_size * options.max_bytes_for_level_multiplier);
+  }
+
+  const size_t old_num_files = CountFiles();
+  std::string begin_string = Key(1000);
+  std::string end_string = Key(2000);
+  Slice begin(begin_string);
+  Slice end(end_string);
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
+  int32_t deleted_count = 0;
+  for (int32_t i = 0; i < 4300; i++) {
+    if (i < 1000 || i > 2000) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    } else {
+      ReadOptions roptions;
+      std::string result;
+      Status s = db_->Get(roptions, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound() || s.ok());
+      if (s.IsNotFound()) {
+        deleted_count++;
+      }
+    }
+  }
+  ASSERT_GT(deleted_count, 0);
+  begin_string = Key(5000);
+  end_string = Key(6000);
+  Slice begin1(begin_string);
+  Slice end1(end_string);
+  // Try deleting files in a range which contains no keys
+  ASSERT_OK(
+      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+  // Push data from level 0 to level 1 to force all data to be deleted
+  // Note that we don't delete level 0 files
+  compact_options.change_level = true;
+  compact_options.target_level = 1;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_OK(
+      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+  int32_t deleted_count2 = 0;
+  for (int32_t i = 0; i < 4300; i++) {
+    ReadOptions roptions;
+    std::string result;
+    ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound());
+    deleted_count2++;
+  }
+  ASSERT_GT(deleted_count2, deleted_count);
+  const size_t new_num_files = CountFiles();
+  ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.max_background_compactions = 3;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // files [0 => 100), [100 => 200), ... [900, 1000)
+  for (auto i = 0; i < 10; i++) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      values[k] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("10", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+  // files [0 => 100), [200 => 300), ... [800, 900)
+  for (auto i = 0; i < 10; i += 2) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5,0,10", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+  // Delete files in range [0, 299] (inclusive)
+  {
+    auto begin_str1 = Key(0), end_str1 = Key(100);
+    auto begin_str2 = Key(100), end_str2 = Key(200);
+    auto begin_str3 = Key(200), end_str3 = Key(299);
+    Slice begin1(begin_str1), end1(end_str1);
+    Slice begin2(begin_str2), end2(end_str2);
+    Slice begin3(begin_str3), end3(end_str3);
+    std::vector<RangePtr> ranges;
+    ranges.push_back(RangePtr(&begin1, &end1));
+    ranges.push_back(RangePtr(&begin2, &end2));
+    ranges.push_back(RangePtr(&begin3, &end3));
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+                                  ranges.data(), ranges.size()));
+    ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+    // Keys [0, 300) should not exist.
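+    // (DeleteFilesInRanges treats each range's end as inclusive by default;
+    // the second block below passes include_end = false to get exclusive
+    // ends.)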
+    for (auto i = 0; i < 300; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+    for (auto i = 300; i < 1000; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+  }
+
+  // Delete files in range [600, 999) (exclusive)
+  {
+    auto begin_str1 = Key(600), end_str1 = Key(800);
+    auto begin_str2 = Key(700), end_str2 = Key(900);
+    auto begin_str3 = Key(800), end_str3 = Key(999);
+    Slice begin1(begin_str1), end1(end_str1);
+    Slice begin2(begin_str2), end2(end_str2);
+    Slice begin3(begin_str3), end3(end_str3);
+    std::vector<RangePtr> ranges;
+    ranges.push_back(RangePtr(&begin1, &end1));
+    ranges.push_back(RangePtr(&begin2, &end2));
+    ranges.push_back(RangePtr(&begin3, &end3));
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+                                  ranges.data(), ranges.size(), false));
+    ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+    // Keys [600, 900) should not exist.
+    for (auto i = 600; i < 900; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+    for (auto i = 300; i < 600; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+    for (auto i = 900; i < 1000; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+  }
+
+  // Delete all files.
+  {
+    RangePtr range;
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+    ASSERT_EQ("", FilesPerLevel(0));
+
+    for (auto i = 0; i < 1000; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // regression test for #2833: groups of files whose user-keys overlap at
+  // the endpoints could be split by `DeleteFilesInRange`. This caused old
+  // data to reappear, either because a new version of the key was removed,
+  // or a range deletion was partially dropped. It could also cause the
+  // non-overlapping invariant to be violated if the files dropped by
+  // DeleteFilesInRange were a subset of files that a range deletion spans.
+  const int kNumL0Files = 2;
+  const int kValSize = 8 << 10;  // 8KB
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  // The snapshot prevents key 1 from having its old version dropped. The low
+  // `target_file_size_base` ensures two keys will be in each output file.
+  const Snapshot* snapshot = nullptr;
+  Random rnd(301);
+  // The value indicates which flush the key belonged to, which is enough
+  // for us to determine the keys' relative ages. After L0 flushes finish,
+  // files look like:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[0]
+  // File 1: 1 -> vals[1], 2 -> vals[1]
+  //
+  // Then L0->L1 compaction happens, which outputs keys as follows:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[1]
+  // File 1: 1 -> vals[0], 2 -> vals[1]
+  //
+  // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+  // would cause `1 -> vals[0]` (an older key) to reappear.
+  std::string vals[kNumL0Files];
+  for (int i = 0; i < kNumL0Files; ++i) {
+    vals[i] = rnd.RandomString(kValSize);
+    ASSERT_OK(Put(Key(i), vals[i]));
+    ASSERT_OK(Put(Key(i + 1), vals[i]));
+    ASSERT_OK(Flush());
+    if (i == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify `DeleteFilesInRange` can't drop only file 0, which would cause
+  // "1 -> vals[0]" to reappear.
+  std::string begin_str = Key(0), end_str = Key(1);
+  Slice begin = begin_str, end = end_str;
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_EQ(vals[1], Get(Key(1)));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 200; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to the second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up the first
+  // path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+  ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+  ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+  ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // Always gets compacted into 1 Level1 file, + // 0/1 Level 0 file + for (int num = 0; num < 3; num++) { + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + 
+  options.max_subcompactions = max_subcompactions_;
+
+  std::vector<Options> option_vector;
+  option_vector.emplace_back(options);
+  ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+  // Configure CF1 specific paths.
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configure CF2 specific paths.
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  Random rnd(301);
+  int key_idx = 0;
+  int key_idx1 = 0;
+  int key_idx2 = 0;
+
+  auto generate_file = [&]() {
+    GenerateNewFile(0, &rnd, &key_idx);
+    GenerateNewFile(1, &rnd, &key_idx1);
+    GenerateNewFile(2, &rnd, &key_idx2);
+  };
+
+  auto check_sstfilecount = [&](int path_id, int expected) {
+    ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+  };
+
+  auto check_filesperlevel = [&](const std::string& expected) {
+    ASSERT_EQ(expected, FilesPerLevel(0));
+    ASSERT_EQ(expected, FilesPerLevel(1));
+    ASSERT_EQ(expected, FilesPerLevel(2));
+  };
+
+  auto check_getvalues = [&]() {
+    for (int i = 0; i < key_idx; i++) {
+      auto v = Get(0, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx1; i++) {
+      auto v = Get(1, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx2; i++) {
+      auto v = Get(2, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+  };
+
+  // Check that the default column family uses db_paths, while column
+  // families "one" and "two" use their own cf_paths.
+
+  // An L0 compaction outputs its SST files to L1. The first path cannot hold
+  // L1's data (400KB + 400KB > 500KB), so every compaction moves an SST file
+  // to the second path. Please refer to LevelCompactionBuilder::GetPathId.
+  for (int num = 0; num < 3; num++) {
+    generate_file();
+  }
+  check_sstfilecount(0, 1);
+  check_sstfilecount(1, 2);
+
+  generate_file();
+  check_sstfilecount(1, 3);
+
+  // (1, 4)
+  generate_file();
+  check_filesperlevel("1,4");
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  // (1, 4, 1)
+  generate_file();
+  check_filesperlevel("1,4,1");
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  // (1, 4, 2)
+  generate_file();
+  check_filesperlevel("1,4,2");
+  check_sstfilecount(2, 2);
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  check_getvalues();
+
+  {  // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths
+    std::vector<LiveFileStorageInfo> new_infos;
+    LiveFilesStorageInfoOptions lfsio;
+    lfsio.wal_size_for_flush = UINT64_MAX;  // no flush
+    ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos));
+    std::unordered_map<std::string, int> live_sst_by_dir;
+    for (auto& info : new_infos) {
+      if (info.file_type == kTableFile) {
+        live_sst_by_dir[info.directory]++;
+        // Verify file on disk (no directory confusion)
+        uint64_t size;
+        ASSERT_OK(env_->GetFileSize(
+            info.directory + "/" + info.relative_filename, &size));
+        ASSERT_EQ(info.size, size);
+      }
+    }
+    ASSERT_EQ(3U * 3U, live_sst_by_dir.size());
+    for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) {
+      ASSERT_EQ(1, live_sst_by_dir[paths[0].path]);
+      ASSERT_EQ(4, live_sst_by_dir[paths[1].path]);
+      ASSERT_EQ(2, live_sst_by_dir[paths[2].path]);
+    }
+  }
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  check_getvalues();
+
+  Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500 << 10;  // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200 << 10;  // 200KB
+  options.target_file_size_multiplier = 1;
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(TotalTableFiles(1, 4), 1);
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  CompactRangeOptions compact_options;
+
compact_options.change_level = true; + compact_options.target_level = 0; + // cannot use kForceOptimized here because the compaction here is expected + // to generate one output file + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + + // Only 1 file in L0 + ASSERT_EQ("1", FilesPerLevel(1)); + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 4; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 3; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + options.num_levels = 1; + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + } + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + for (int i = 1; i < options.num_levels; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "b", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "b")); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("(a->v)", Contents(1)); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "c", "cv")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "d", "dv")); + ReopenWithColumnFamilies({"default", "pikachu"}, 
+                             CurrentOptions());
+    ASSERT_OK(Put(1, "", ""));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "d"));
+    ASSERT_OK(Delete(1, "b"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+       {"DBImpl::RunManualCompaction:WaitScheduled",
+        "BackgroundCallCompaction:0"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(1, "foo", ""));
+  ASSERT_OK(Put(1, "bar", ""));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", ""));
+  ASSERT_OK(Put(1, "bar", ""));
+  // Generate four files in CF 0, which should trigger an auto compaction
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+
+  // The auto compaction is scheduled but waits until here
+  TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+  // The auto compaction will wait until the manual compaction is registered
+  // before processing so that it will be cancelled.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(1));
+
+  // Eventually the cancelled compaction will be rescheduled and executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, ManualCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p", "q"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + uint64_t prev_block_cache_add = + options.statistics->getTickerCount(BLOCK_CACHE_ADD); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); + // Verify manual compaction doesn't fill block cache + ASSERT_EQ(prev_block_cache_add, + options.statistics->getTickerCount(BLOCK_CACHE_ADD)); + + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + if (iter == 0) { + options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + +TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "p", "begin")); + ASSERT_OK(Put(1, "q", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p", "q", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Populate a different range + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "c", "begin")); + ASSERT_OK(Put(1, "e", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3,1", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,2", FilesPerLevel(1)); + ASSERT_EQ(2, 
+            GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compact all
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("1,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+    CompactRangeOptions compact_options;
+    compact_options.target_path_id = 1;
+    compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    if (iter == 0) {
+      DestroyAndReopen(options);
+      options = CurrentOptions();
+      options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+      options.max_background_flushes = 1;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v2"));
+    Compact(1, "a", "z");
+    const size_t num_files = CountLiveFiles();
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(Put(1, "foo", "v2"));
+      Compact(1, "a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), num_files);
+  } while (ChangeCompactOptions());
+}
+
+// Check level compaction with CompactFiles
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.level0_stop_writes_trigger = 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+  for (int file_picked = 5; file_picked > 0; --file_picked) {
+    std::set<std::string> overlapping_file_names;
+    std::vector<std::string> compaction_input_file_names;
+    for (int f = 0; f < file_picked; ++f) {
+      int level = 0;
+      auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+      compaction_input_file_names.push_back(file_meta->name);
+      GetOverlappingFileNumbersForLevelCompaction(
+          cf_meta, options.comparator, level, output_level, file_meta,
+          &overlapping_file_names);
+    }
+
+    ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+                                     compaction_input_file_names,
+                                     output_level));
+
+    // Make sure all overlapping files do not exist after compaction
+    dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+    VerifyCompactionResult(cf_meta, overlapping_file_names);
+  }
+
+  // make sure all key-values are still there.
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND");
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+  Options options;
+  const int kKeySize = 16;
+  const int kKvSize = 1000;
+  const int kKeysPerBuffer = 100;
+  const int kNumL1Files = 5;
+  options.create_if_missing = true;
+  options.write_buffer_size = kKeysPerBuffer * kKvSize;
+  options.max_write_buffer_number = 2;
+  options.target_file_size_base =
+      options.write_buffer_size * (options.max_write_buffer_number - 1);
+  options.level0_file_num_compaction_trigger = kNumL1Files;
+  options.max_bytes_for_level_base =
+      options.level0_file_num_compaction_trigger *
+      options.target_file_size_base;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  // stop the compaction thread until we simulate the file creation failure.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  const int kNumInsertedKeys = options.level0_file_num_compaction_trigger *
+                               (options.max_write_buffer_number - 1) *
+                               kKeysPerBuffer;
+
+  Random rnd(301);
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    keys.emplace_back(rnd.RandomString(kKeySize));
+    values.emplace_back(rnd.RandomString(kKvSize - kKeySize));
+    ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // Make sure the number of L0 files can trigger compaction.
+  ASSERT_GE(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+
+  auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+  // Fail the first file creation.
+  env_->non_writable_count_ = 1;
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Expect compaction to fail here as one file will fail its
+  // creation.
+  ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+  // Verify L0 -> L1 compaction does fail.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Verify all L0 files are still there.
+  ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+  // All key-values must exist after compaction fails.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+
+  env_->non_writable_count_ = 0;
+
+  // Make sure RocksDB will not get into corrupted state.
+  Reopen(options);
+
+  // Verify again after reopen.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+  // iter 1 -- delete_obsolete_files_period_micros == 0
+  for (int iter = 0; iter < 2; ++iter) {
+    // This test triggers move compaction and verifies that the file is not
+    // deleted when it's part of move compaction
+    Options options = CurrentOptions();
+    options.env = env_;
+    if (iter == 1) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger =
+        2;  // trigger compaction when we have 2 files
+    OnFileDeletionListener* listener = new OnFileDeletionListener();
+    options.listeners.emplace_back(listener);
+    options.max_subcompactions = max_subcompactions_;
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    // Create two 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute L0->L1
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+
+    // block compactions
+    test::SleepingBackgroundTask sleeping_task;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                   Env::Priority::LOW);
+
+    options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+    Reopen(options);
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // let compactions go
+    sleeping_task.WakeUp();
+    sleeping_task.WaitUntilDone();
+
+    // this should execute L1->L2 (move)
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    ASSERT_EQ(metadata.size(), 1U);
+    auto moved_file_name = metadata[0].name;
+
+    // Create two more 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute both L0->L1 and L1->L2 (merge with previous file)
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+    // iterator is holding the file
+    ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+    listener->SetExpectedFileName(dbname_ + moved_file_name);
+    ASSERT_OK(iterator->status());
+    iterator.reset();
+
+    // this file should have been compacted away
+    ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+    listener->VerifyMatchedCount(1);
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+  if (!Zlib_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  // First two levels have no compression, so that a trivial move between
+  // them will be allowed.
+  // Level 2 has Zlib compression so that a trivial move to level 3 will not
+  // be allowed.
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kZlibCompression};
+  int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:Matches",
+      [&](void* /*arg*/) { matches++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:DidntMatch",
+      [&](void* /*arg*/) { didnt_match++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are going to level 0
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to fill up level 0
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  ASSERT_EQ(matches, 12);
+  // Currently, the test relies on the number of calls to
+  // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2; + ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch); + ASSERT_EQ(trivial_move, 12); + ASSERT_EQ(non_trivial, 8); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) { + Options options = CurrentOptions(); + options.max_background_compactions = 5; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit); + + options.max_background_compactions = 3; + options.soft_pending_compaction_bytes_limit = 200; + options.hard_pending_compaction_bytes_limit = 150; + DestroyAndReopen(options); + ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit); +} + +// This tests for a bug that could cause two level0 compactions running +// concurrently +// TODO(aekmekji): Make sure that the reason this fails when run with +// max_subcompactions > 1 is not a correctness issue but just inherent to +// running parallel L0-L1 compactions +TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + options.max_background_compactions = 2; + + DestroyAndReopen(options); + + // fill up the DB + Random rnd(301); + for (int num = 0; num < 10; num++) { + GenerateNewRandomFile(&rnd); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CompactionJob::Run():Start", + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"}, + {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2", + "CompactionJob::Run():End"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // trigger L0 compaction + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); + + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +static std::string ShortKey(int i) { + assert(i < 10000); + char buf[100]; + snprintf(buf, sizeof(buf), "key%04d", i); + return std::string(buf); +} + +TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { + 
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The key size is guaranteed to be <= 8
+  class ShortKeyComparator : public Comparator {
+    int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+                const ROCKSDB_NAMESPACE::Slice& b) const override {
+      assert(a.size() <= 8);
+      assert(b.size() <= 8);
+      return BytewiseComparator()->Compare(a, b);
+    }
+    const char* Name() const override { return "ShortKeyComparator"; }
+    void FindShortestSeparator(
+        std::string* start,
+        const ROCKSDB_NAMESPACE::Slice& limit) const override {
+      return BytewiseComparator()->FindShortestSeparator(start, limit);
+    }
+    void FindShortSuccessor(std::string* key) const override {
+      return BytewiseComparator()->FindShortSuccessor(key);
+    }
+  } short_key_cmp;
+  Options options = CurrentOptions();
+  options.target_file_size_base = 100000000;
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  options.comparator = &short_key_cmp;
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // then compact the bottommost level L3=>L3 (non-trivial move)
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  // File with keys [ 200 => 299 ]
+  for (int i = 200; i < 300; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  trivial_move = 0;
+  non_trivial_move = 0;
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // and will skip bottommost level compaction
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 3);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_EQ(Get(ShortKey(i)), values[i]);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
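+// A minimal illustrative sketch, not part of the original suite: the
+// BottommostLevelCompaction modes exercised above can also be requested
+// without change_level. kForce always rewrites bottommost files, kSkip
+// leaves them alone, and kForceOptimized may replace the rewrite of a
+// single file with a trivial move. Assumes the DBCompactionTest fixture
+// helpers (CurrentOptions/Put/Flush) used throughout this file.
+TEST_F(DBCompactionTest, BottommostLevelCompactionModesSketch) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  // A full-range manual compaction with kForce is expected to push the
+  // single file to the bottommost level and rewrite it there.
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+}
+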
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.write_buffer_size = 2 << 20;  // 2MB
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(64 << 20);  // 64MB
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin flushing files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // index:   0   1   2   3   4    5    6   7   8   9
+  // size:  1MB 1MB 1MB 1MB 1MB  2MB  1MB 1MB 1MB 1MB
+  // score:                     1.5  1.3 1.5 2.0 inf
+  //
+  // Files 0-4 will be included in an L0->L1 compaction.
+  //
+  // L0->L0 will be triggered since the sync points guarantee compaction to
+  // base level is still blocked when files 5-9 trigger another compaction.
+  //
+  // Files 6-9 are the longest span of available files for which
+  // work-per-deleted-file decreases (see "score" row above).
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(0), ""));  // prevents trivial move
+    if (i == 5) {
+      TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready");
+      ASSERT_OK(Put(Key(i + 1), value + value));
+    } else {
+      ASSERT_OK(Put(Key(i + 1), value));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GE(level_to_files.size(), 2);  // at least L0 and L1
+  // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+  ASSERT_EQ(2, level_to_files[0].size());
+  ASSERT_GT(level_to_files[1].size(), 0);
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+  }
+
+  // The index/filter in the file produced by intra-L0 should not be pinned.
+  // That means clearing unref'd entries in block cache and re-accessing the
+  // file produced by intra-L0 should bump the index block miss count.
+  uint64_t prev_index_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  table_options.block_cache->EraseUnRefEntries();
+  ASSERT_EQ("", Get(Key(0)));
+  ASSERT_EQ(prev_index_misses + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+  // Regression test for issue #2722: L0->L0 compaction can resurrect deleted
+  // keys from older L0 files if L1+ files' key-ranges do not include the key.
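+  // Spelled out (illustrative): suppose nothing below L0 overlaps key k. An
+  // older L0 file holds Put(k) and a newer L0 file holds Delete(k). If an
+  // L0->L0 compaction of only the newer files dropped the tombstone (its key
+  // is covered by no lower level), a later read of k could surface the older
+  // file's Put(k), resurrecting the deleted key. The tombstone must
+  // therefore survive L0->L0, which is what this test checks.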
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin flushing files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+        "L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // index:   0   1   2   3   4    5    6   7   8   9
+  // size:  1MB 1MB 1MB 1MB 1MB  1MB  1MB 1MB 1MB 1MB
+  // score:                    1.25 1.33 1.5 2.0 inf
+  //
+  // Files 0-4 will be included in an L0->L1 compaction.
+  //
+  // L0->L0 will be triggered since the sync points guarantee compaction to
+  // base level is still blocked when files 5-9 trigger another compaction.
+  // All files 5-9 are included in the L0->L0 due to work-per-deleted file
+  // decreasing.
+  //
+  // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+  // L0->L0 preserves the deletion such that the key remains deleted.
+  for (int i = 0; i < 10; ++i) {
+    // key 0 serves both to prevent trivial move and as the key we want to
+    // verify is not resurrected by L0->L0 compaction.
+    if (i < 5) {
+      ASSERT_OK(Put(Key(0), ""));
+    } else {
+      ASSERT_OK(Delete(Key(0)));
+    }
+    if (i == 5) {
+      TEST_SYNC_POINT(
+          "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+          "L0ToL1Ready");
+    }
+    ASSERT_OK(Put(Key(i + 1), value));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GE(level_to_files.size(), 2);  // at least L0 and L1
+  // L0 has a single output file from L0->L0
+  ASSERT_EQ(1, level_to_files[0].size());
+  ASSERT_GT(level_to_files[1].size(), 0);
+  ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+  ReadOptions roptions;
+  std::string result;
+  ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+  const int kNumFilesTrigger = 3;
+  Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  for (bool use_universal_compaction : {false, true}) {
+    Options options = CurrentOptions();
+    if (use_universal_compaction) {
+      options.compaction_style = kCompactionStyleUniversal;
+    } else {
+      options.compaction_style = kCompactionStyleLevel;
+      options.level_compaction_dynamic_level_bytes = true;
+    }
+    options.num_levels = 4;
+    options.write_buffer_size = 100 << 10;     // 100KB
+    options.target_file_size_base = 32 << 10;  // 32KB
+    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+    // Trigger compaction if size amplification exceeds 110%
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    DestroyAndReopen(options);
+
+    int num_bottom_pri_compactions = 0;
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::BGWorkBottomCompaction",
+        [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+
SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(1, num_bottom_pri_compactions); + + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { + // Deletions can be dropped when compacted to non-last level if they fall + // outside the lower-level files' key-ranges. + const int kNumL0Files = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + + // put key 1 and 3 in separate L1, L2 files. + // So key 0, 2, and 4+ fall outside these levels' key-ranges. + for (int level = 2; level >= 1; --level) { + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(level); + ASSERT_EQ(2, NumTableFilesAtLevel(level)); + } + + // Delete keys in range [1, 4]. These L0 files will be compacted with L1: + // - Tombstones for keys 2 and 4 can be dropped early. + // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + for (int i = 0; i < kNumL0Files; ++i) { + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound()); + } + ASSERT_EQ(2, options.statistics->getTickerCount( + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE)); + ASSERT_EQ(2, + options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE)); +} + +TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { + // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/ + // CompactFiles() had a bug where it failed to pick a compaction when an L0 + // compaction existed, but marked it as scheduled anyways. It'd never be + // unmarked as scheduled, so future compactions or DB close could hang. + const int kNumL0Files = 5; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files - 1; + options.max_background_compactions = 2; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::CompactFilesPendingL0Bug:Picked"}, + {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + auto schedule_multi_compaction_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // Files 0-3 will be included in an L0->L1 compaction. + // + // File 4 will be included in a call to CompactFiles() while the first + // compaction is running. 
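+  // (Note, illustrative: the compaction pressure token taken above makes the
+  // write controller report compaction pressure, so the DB is willing to
+  // schedule the CompactFiles() call below concurrently with the
+  // already-running L0->L1 compaction.)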
+  for (int i = 0; i < kNumL0Files - 1; ++i) {
+    ASSERT_OK(Put(Key(0), "val"));  // sentinel to prevent trivial move
+    ASSERT_OK(Put(Key(i + 1), "val"));
+    ASSERT_OK(Flush());
+  }
+  TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+  // file 4 flushed after 0-3 picked
+  ASSERT_OK(Put(Key(kNumL0Files), "val"));
+  ASSERT_OK(Flush());
+
+  // Previously, DB close would hang forever, as this situation caused the
+  // scheduled compactions count to never decrement to zero.
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+  std::vector<std::string> input_filenames;
+  input_filenames.push_back(cf_meta.levels[0].files.front().name);
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+                                   0 /* output_level */));
+  TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+  // Regression test for a bug of not pulling in L0 files that overlap the
+  // user-specified input files in time- and key-ranges.
+  ASSERT_OK(Put(Key(0), "old_val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(0), "new_val"));
+  ASSERT_OK(Flush());
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_GE(cf_meta.levels.size(), 2);
+  ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+  // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+  // overlaps in key-range and time-range.
+  std::vector<std::string> input_filenames;
+  input_filenames.push_back(cf_meta.levels[0].files.front().name);
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+                                   1 /* output_level */));
+  ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  const Snapshot* snapshot = nullptr;
+  const int kMaxKey = 10;
+
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+    ASSERT_OK(Delete(Key(i)));
+    if (!snapshot) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey)));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Test that DeleteFilesInRange() deletes files already picked for
+  // compaction.
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply:WriteManifestStart",
+        "BackgroundCallCompaction:0"},
+       {"DBImpl::BackgroundCompaction:Finish",
+        "VersionSet::LogAndApply:WriteManifestDone"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Release the snapshot, which marks the bottommost file for compaction.
+  db_->ReleaseSnapshot(snapshot);
+  std::string begin_string = Key(0);
+  std::string end_string = Key(kMaxKey + 1);
+  Slice begin(begin_string);
+  Slice end(end_string);
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+  // Bottom-level files may contain deletions due to snapshots protecting the
+  // deleted keys. Once the snapshot is released, we should see files with
+  // many such deletions undergo single-file compactions.
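+  // (Context, illustrative: bottommost files whose newest data is older than
+  // the oldest snapshot become eligible for this rewrite; releasing the
+  // snapshot is what marks them, and the picker reports such jobs with
+  // CompactionReason::kBottommostFiles, asserted via the SyncPoint below.)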
+  const int kNumKeysPerFile = 1024;
+  const int kNumLevelFiles = 4;
+  const int kValueSize = 128;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumLevelFiles;
+  // inflate it a bit to account for key/metadata overhead
+  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+  CreateAndReopenWithCF({"one"}, options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    if (i == kNumLevelFiles - 1) {
+      snapshot = db_->GetSnapshot();
+      // delete every other key after grabbing a snapshot, so these deletions
+      // and the keys they cover can't be dropped until after the snapshot is
+      // released.
+      for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+        ASSERT_OK(Delete(Key(j)));
+      }
+    }
+    ASSERT_OK(Flush());
+    if (i < kNumLevelFiles - 1) {
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+  db_->GetLiveFilesMetaData(&pre_release_metadata);
+  // just need to bump seqnum so ReleaseSnapshot knows the newest key in the
+  // SST files does not need to be preserved in case of a future snapshot.
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  // release snapshot and wait for compactions to finish. Single-file
+  // compactions should be triggered, which reduce the size of each
+  // bottom-level file without changing the file count.
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() ==
+                    CompactionReason::kBottommostFiles);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetLiveFilesMetaData(&post_release_metadata);
+  ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+  for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+    const auto& pre_file = pre_release_metadata[i];
+    const auto& post_file = post_release_metadata[i];
+    ASSERT_EQ(1, pre_file.level);
+    ASSERT_EQ(1, post_file.level);
+    // each file is smaller than it was before as it was rewritten without
+    // deletion markers/deleted keys.
+    ASSERT_LT(post_file.size, pre_file.size);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
+  // Bottom-level files may contain deletions due to snapshots protecting the
+  // deleted keys. Once the snapshot is released, files with many such
+  // deletions would normally undergo single-file compactions. But with auto
+  // compactions disabled, no such compaction should be triggered, since that
+  // could create too many background jobs.
+  const int kNumKeysPerFile = 1024;
+  const int kNumLevelFiles = 4;
+  const int kValueSize = 128;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumLevelFiles;
+  // inflate it a bit to account for key/metadata overhead
+  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+  Reopen(options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    if (i == kNumLevelFiles - 1) {
+      snapshot = db_->GetSnapshot();
+      // delete every other key after grabbing a snapshot, so these deletions
+      // and the keys they cover can't be dropped until after the snapshot is
+      // released.
+      for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+        ASSERT_OK(Delete(Key(j)));
+      }
+    }
+    ASSERT_OK(Flush());
+    if (i < kNumLevelFiles - 1) {
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr));
+  ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+  db_->GetLiveFilesMetaData(&pre_release_metadata);
+  // just need to bump the seqnum so ReleaseSnapshot knows the newest key in
+  // the SST files does not need to be preserved in case of a future snapshot.
+  ASSERT_OK(Put(Key(0), "val"));
+
+  // release the snapshot; no compaction should be triggered.
+  std::atomic<int> num_compactions{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { num_compactions.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, num_compactions);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  db_->GetLiveFilesMetaData(&post_release_metadata);
+  ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+  for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+    const auto& pre_file = pre_release_metadata[i];
+    const auto& post_file = post_release_metadata[i];
+    ASSERT_EQ(1, pre_file.level);
+    ASSERT_EQ(1, post_file.level);
+    // each file is the same as before, still carrying its deletion markers
+    // and the deleted keys they cover.
+    ASSERT_EQ(post_file.size, pre_file.size);
+  }
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) {
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 20;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.compaction_pri = kRoundRobin;
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // add a few extra seconds to each wait time, to make sure the file is
+  // expired
+  int small_seconds = 1;
+
+  std::atomic_int ttl_compactions{0};
+  std::atomic_int round_robin_ttl_compactions{0};
+  std::atomic_int other_compactions{0};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+          round_robin_ttl_compactions++;
+        } else {
+          other_compactions++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  // Set up the files from the bottom level upward; each file is 1 hour older
+  // than the next one.
+  // create 10 files on the last level (L6)
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // generate 1 file per hour
+  }
+  MoveFilesToLevel(6);
+
+  // create 5 files on L5
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 200; j++) {
+      ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);
+  }
+  MoveFilesToLevel(5);
+
+  // create 3 files on L4
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 300; j++) {
+      ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);
+  }
+  MoveFilesToLevel(4);
+
+  // The LSM tree should be like:
+  // L4: [0, 299], [300, 599], [600, 899]
+  // L5: [0, 199] [200, 399]...............[800, 999]
+  // L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999]
+  ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel());
+
+  // make sure the first L5 file is expired
+  env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++);
+
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(4), "value" + std::to_string(1)));
+  ASSERT_OK(Put(Key(5), "value" + std::to_string(1)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify there's a RoundRobin TTL compaction
+  ASSERT_EQ(1, round_robin_ttl_compactions);
+  round_robin_ttl_compactions = 0;
+
+  // expire 2 more files
+  env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(4), "value" + std::to_string(2)));
+  ASSERT_OK(Put(Key(5), "value" + std::to_string(2)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(2, round_robin_ttl_compactions);
+  round_robin_ttl_compactions = 0;
+
+  // expire 4 more files, 2 out of 3 files on L4 are expired
+  env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(3)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(3)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(1, NumTableFilesAtLevel(4));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  // make the first L0 file expired, which triggers a normal TTL compaction
+  // instead of a round-robin TTL compaction; it will also include an extra
+  // file from L0 because of overlap
+  ASSERT_EQ(0, ttl_compactions);
+  env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++);
+
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // The L0 -> L1 compaction is a normal TTL compaction; L1 -> next-level
+  // compactions are RoundRobin TTL compactions.
+  ASSERT_GT(ttl_compactions, 0);
+  ttl_compactions = 0;
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  // All files are expired, so only the last level has data
+  env_->MockSleepForSeconds(24 * 60 * 60);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+  ASSERT_GT(ttl_compactions, 0);
+  ttl_compactions = 0;
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) {
+  // This tests the case where the RoundRobin compaction cursor does not point
+  // to the oldest file; RoundRobin compaction should still compact the files
+  // after the cursor until all expired files are compacted.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 20;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.compaction_pri = kRoundRobin;
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+  options.env = env_;
+
+  std::atomic_int ttl_compactions{0};
+  std::atomic_int round_robin_ttl_compactions{0};
+  std::atomic_int other_compactions{0};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+          round_robin_ttl_compactions++;
+        } else {
+          other_compactions++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  // create 10 files on the last level (L6)
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // generate 1 file per hour
+  }
+  MoveFilesToLevel(6);
+
+  // create 5 files on L5
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 200; j++) {
+      ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // 1 hour
+  }
+  MoveFilesToLevel(5);
+
+  // The LSM tree should be like:
+  // L5: [0, 199] [200, 399] [400,599] [600,799] [800, 999]
+  // L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999]
+  ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel());
+
+  // point the compaction cursor to the 4th file on L5
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
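+  // (The chain below, VersionSet -> ColumnFamilySet -> Version ->
+  // VersionStorageInfo, is how the test reaches the in-memory LSM metadata;
+  // AddCursorForOneLevel() then plants the round-robin cursor directly.)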
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+  VersionStorageInfo* storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+  const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue);
+  storage_info->AddCursorForOneLevel(5, split_cursor);
+
+  // make the first file on L5 expired; there should be 3 TTL compactions:
+  // the 4th one, the 5th one, then the 1st one.
+  env_->MockSleepForSeconds(19 * 60 * 60 + 1);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(2, NumTableFilesAtLevel(5));
+
+  ASSERT_EQ(3, round_robin_ttl_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+  ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 1024;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.max_open_files = -1;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+  // Delete previously written keys.
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,0,2", FilesPerLevel());
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  env_->MockSleepForSeconds(36 * 60 * 60);  // 36 hours
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  // Just do a simple write + flush so that the TTL-expired files get
+  // compacted.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All non-L0 files are deleted, as they contained only deleted data.
+  ASSERT_EQ("1", FilesPerLevel());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test dynamically changing ttl.
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+  // Delete previously written keys.
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,0,2", FilesPerLevel());
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  // Move time forward by 12 hours, and make sure that compaction still doesn't
+  // trigger as ttl is set to 24 hours.
+  env_->MockSleepForSeconds(12 * 60 * 60);
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dynamically change ttl to 10 hours.
+  // This should trigger a ttl compaction, as 12 hours have already passed.
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All non-L0 files are deleted, as they contained only deleted data.
+  ASSERT_EQ("1", FilesPerLevel());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+  env_->SetMockSleep();
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.compression = kNoCompression;
+      options.ttl = 24 * 60 * 60;  // 24 hours
+      if (if_open_all_files) {
+        options.max_open_files = -1;
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 2;
+          });
+      // In the case where all files are opened and the DB is restarted, force
+      // the oldest ancestor time in the manifest file to be 0 to simulate
+      // reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_field = static_cast<std::string*>(arg);
+              *encoded_field = "";
+              PutVarint64(encoded_field, 0);
+            }
+          });
+
+      options.env = env_;
+
+      // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+      DestroyAndReopen(options);
+
+      int ttl_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kTtl) {
+              ttl_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+      Random rnd(301);
+      for (int i = 1; i <= 100; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      // Get the first file's creation time. This will be the oldest file in
+      // the DB. Compactions involving this file's descendants should keep
+      // getting this time.
+      std::vector<std::vector<FileMetaData>> level_to_files;
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+      // Add 1 hour and do another flush.
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 101; i <= 200; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(6);
+      ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+      for (int i = 1; i <= 50; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 51; i <= 150; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(4);
+      ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add one L1 file with key range: [26, 75].
+      for (int i = 26; i <= 75; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+      // LSM tree:
+      // L1:         [26 .. 75]
+      // L4:     [1 .. 50][51 ..... 150]
+      // L6: [1 ........ 100][101 .... 200]
+      //
+      // On TTL expiry, TTL compaction should be initiated on the L1 file, and
+      // the compactions should keep going until the key range hits the bottom
+      // level. In other words: the compaction on this data range "cascades"
+      // until reaching the bottom level.
+      //
+      // Order of events on TTL expiry:
+      // 1. The L1 file falls to L3 via 2 trivial moves which are initiated by
+      //    the ttl compaction.
+      // 2. A TTL compaction happens between the L3 and L4 files. Output file
+      //    in L4.
+      // 3. The new output file from L4 falls to L5 via 1 trivial move
+      //    initiated by the ttl compaction.
+      // 4. A TTL compaction happens between the L5 and L6 files. Output in L6.
+
+      // Add 25 hours and do a write
+      env_->MockSleepForSeconds(25 * 60 * 60);
+
+      ASSERT_OK(Put(Key(1), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_EQ(5, ttl_compactions);
+
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+      env_->MockSleepForSeconds(25 * 60 * 60);
+      ASSERT_OK(Put(Key(2), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_GE(ttl_compactions, 6);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+  env_->SetMockSleep();
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+      if (if_open_all_files) {
+        options.max_open_files = -1;  // needed for ttl compaction
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 0;
+          });
+      // In the case where all files are opened and the DB is restarted, force
+      // the file creation time in the manifest file to be 0 to simulate
+      // reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_field = static_cast<std::string*>(arg);
+              *encoded_field = "";
+              PutVarint64(encoded_field, 0);
+            }
+          });
+
+      options.env = env_;
+
+      // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+      DestroyAndReopen(options);
+
+      int periodic_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+              periodic_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      Random rnd(301);
+      for (int i = 0; i < kNumLevelFiles; ++i) {
+        for (int j = 0; j < kNumKeysPerFile; ++j) {
+          ASSERT_OK(
+              Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+        }
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      ASSERT_EQ("2", FilesPerLevel());
+      ASSERT_EQ(0, periodic_compactions);
+
+      // Add 50 hours and do a write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("a", "1"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // Assert that the files stay in the same level
+      ASSERT_EQ("3", FilesPerLevel());
+      // The two old files go through the periodic compaction process
+      ASSERT_EQ(2, periodic_compactions);
+
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,3", FilesPerLevel());
+
+      // Add another 50 hours and do another write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("b", "2"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,3", FilesPerLevel());
+      // The three old files now go through the periodic compaction process.
+      // 2 + 3.
+      ASSERT_EQ(5, periodic_compactions);
+
+      // Add another 50 hours and do another write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("c", "3"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("2,3", FilesPerLevel());
+      // The four old files now go through the periodic compaction process.
+      // 5 + 4.
+      ASSERT_EQ(9, periodic_compactions);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+  // This test makes sure that periodic compactions are working with a DB
+  // where the file_creation_time of some files is 0.
+  // After compactions, the new files are created with a valid
+  // file_creation_time.
+
+  const int kNumKeysPerFile = 32;
+  const int kNumFiles = 4;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  int periodic_compactions = 0;
+  bool set_file_creation_time_to_zero = true;
+  bool set_creation_time_to_zero = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+          periodic_compactions++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          props->file_creation_time = 0;
+        }
+        if (set_creation_time_to_zero) {
+          props->creation_time = 0;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+    // Move the first two files to L2.
+    if (i == 1) {
+      MoveFilesToLevel(2);
+      set_creation_time_to_zero = false;
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  ASSERT_EQ(0, periodic_compactions);
+
+  Close();
+
+  set_file_creation_time_to_zero = false;
+  // Forward the clock by 2 days.
+  env_->MockSleepForSeconds(2 * 24 * 60 * 60);
+  options.periodic_compaction_seconds = 1 * 24 * 60 * 60;  // 1 day
+
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  // Make sure that all files go through periodic compaction.
+  ASSERT_EQ(kNumFiles, periodic_compactions);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  options.ttl = 10 * 60 * 60;  // 10 hours
+  options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+  options.max_open_files = -1;  // needed for both periodic and ttl compactions
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  int periodic_compactions = 0;
+  int ttl_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+          periodic_compactions++;
+        } else if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  MoveFilesToLevel(3);
+
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+  ASSERT_EQ(0, periodic_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+
+  // Add some time greater than periodic_compaction_time.
+  env_->MockSleepForSeconds(50 * 60 * 60);
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Files in the bottom level go through periodic compactions.
+  ASSERT_EQ("1,0,0,2", FilesPerLevel());
+  ASSERT_EQ(2, periodic_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+
+  // Add a little more time than ttl
+  env_->MockSleepForSeconds(11 * 60 * 60);
+  ASSERT_OK(Put("b", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Notice that the previous file in level 1 falls down to the bottom level
+  // due to ttl compactions, one level at a time.
+  // And bottom level files don't get picked up for ttl compactions.
+  ASSERT_EQ("1,0,0,3", FilesPerLevel());
+  ASSERT_EQ(2, periodic_compactions);
+  ASSERT_EQ(3, ttl_compactions);
+
+  // Add some time greater than periodic_compaction_time.
+  env_->MockSleepForSeconds(50 * 60 * 60);
+  ASSERT_OK(Put("c", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // The previous L0 file falls one level at a time to the bottom level due to
+  // ttl. And all 4 bottom files go through periodic compactions.
+  ASSERT_EQ("1,0,0,4", FilesPerLevel());
+  ASSERT_EQ(6, periodic_compactions);
+  ASSERT_EQ(6, ttl_compactions);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlBooster) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 3;
+  const int kValueSize = 1000;
+
+  Options options = CurrentOptions();
+  options.ttl = 10 * 60 * 60;  // 10 hours
+  options.periodic_compaction_seconds = 480 * 60 * 60;  // very long
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize};
+  options.max_open_files = -1;  // needed for both periodic and ttl compactions
+  options.compaction_pri = CompactionPri::kMinOverlappingRatio;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  MoveFilesToLevel(2);
+
+  ASSERT_EQ("0,0,3", FilesPerLevel());
+
+  // Create some files for L1
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  ASSERT_EQ("0,1,3", FilesPerLevel());
+
+  // Make the new L0 files qualify for TTL boosting and generate one more file
+  // to trigger an L1 -> L2 compaction. Old files will be picked even if their
+  // priority is lower without boosting.
+  env_->MockSleepForSeconds(8 * 60 * 60);
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i),
+                    rnd.RandomString(kValueSize * 2)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  // Force files to be compacted to L1
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1,2", FilesPerLevel());
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+
+  ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+  class TestCompactionFilter : public CompactionFilter {
+    const char* Name() const override { return "TestCompactionFilter"; }
+  };
+  class TestCompactionFilterFactory : public CompactionFilterFactory {
+    const char* Name() const override { return "TestCompactionFilterFactory"; }
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& /*context*/) override {
+      return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+    }
+  };
+
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Random rnd(301);
+
+  Options options = CurrentOptions();
+  TestCompactionFilter test_compaction_filter;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  enum CompactionFilterType {
+    kUseCompactionFilter,
+    kUseCompactionFilterFactory
+  };
+
+  for (CompactionFilterType comp_filter_type :
+       {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+    // Assert that periodic compactions are not enabled.
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max() - 1,
+              options.periodic_compaction_seconds);
+
+    if (comp_filter_type == kUseCompactionFilter) {
+      options.compaction_filter = &test_compaction_filter;
+      options.compaction_filter_factory.reset();
+    } else if (comp_filter_type == kUseCompactionFilterFactory) {
+      options.compaction_filter = nullptr;
+      options.compaction_filter_factory.reset(
+          new TestCompactionFilterFactory());
+    }
+    DestroyAndReopen(options);
+
+    // periodic_compaction_seconds should be set to the sanitized value when
+    // a compaction filter or a compaction filter factory is used.
+    ASSERT_EQ(30 * 24 * 60 * 60,
+              dbfull()->GetOptions().periodic_compaction_seconds);
+
+    int periodic_compactions = 0;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+          Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+          auto compaction_reason = compaction->compaction_reason();
+          if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+            periodic_compactions++;
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    for (int i = 0; i < kNumLevelFiles; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(
+            Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("2", FilesPerLevel());
+    ASSERT_EQ(0, periodic_compactions);
+
+    // Add 31 days and do a write
+    env_->MockSleepForSeconds(31 * 24 * 60 * 60);
+    ASSERT_OK(Put("a", "1"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Assert that the files stay in the same level
+    ASSERT_EQ("3", FilesPerLevel());
+    // The two old files go through the periodic compaction process
+    ASSERT_EQ(2, periodic_compactions);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`, a
+  // manual compaction only triggers a flush after it's sure a stall won't be
+  // triggered by the L0 file count going too high.
+  const int kNumL0FilesTrigger = 4;
+  const int kNumL0FilesLimit = 8;
+  // i == 0: verifies the normal case where the stall is avoided by delay
+  // i == 1: verifies no delay in the edge case where the stall trigger is the
+  //         same as the compaction trigger, so the stall can't be avoided
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+    if (i == 0) {
+      options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+    } else {
+      options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+    }
+    Reopen(options);
+
+    if (i == 0) {
+      // ensure the auto compaction doesn't finish until manual compaction has
+      // had a chance to be delayed.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+            "CompactionJob::Run():End"}});
+    } else {
+      // ensure the auto-compaction doesn't finish until manual compaction has
+      // continued without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "CompactionJob::Run():End"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + manual_compaction_thread.join(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // immutable memtable count going too high. + const int kNumImmMemTableLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as flush + // trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + // the delay limit is one less than the stop limit. This test focuses on + // avoiding delay limit, but this option sets stop limit, so add one. + options.max_write_buffer_number = kNumImmMemTableLimit + 1; + if (i == 1) { + options.min_write_buffer_number_to_merge = kNumImmMemTableLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the flush doesn't finish until manual compaction has had a + // chance to be delayed. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); + } else { + // ensure the flush doesn't finish until manual compaction has continued + // without delay. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "FlushJob::WriteLevel0Table"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); + FlushOptions flush_opts; + flush_opts.wait = false; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->Flush(flush_opts)); + } + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + manual_compaction_thread.join(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay + // does not hang if CF is dropped or DB is closed + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it + // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to + // simulate what happens during Close as we can't call Close (it + // blocks on the auto-compaction, making a cycle). + for (int i = 0; i < 2; ++i) { + CreateAndReopenWithCF({"one"}, options); + // The calls to close CF/DB wait until the manual compaction stalls. + // The auto-compaction waits until the manual compaction finishes to ensure + // the signal comes from closing CF/DB, not from compaction making progress. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"}, + {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual", + "CompactionJob::Run():End"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); + } + ASSERT_OK(Flush(1)); + } + auto manual_compaction_thread = port::Thread([this, i]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + if (i == 0) { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); + } + }); + + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"); + if (i == 0) { + ASSERT_OK(db_->DropColumnFamily(handles_[1])); + } else { + dbfull()->CancelAllBackgroundWork(false /* wait */); + } + manual_compaction_thread.join(); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, + // CompactRange skips its flush if the delay is long enough that the memtables + // existing at the beginning of the call have already been flushed. + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + Reopen(options); + + Random rnd(301); + // The manual flush includes the memtable that was active when CompactRange + // began. So it unblocks CompactRange and precludes its flush. Throughout the + // test, stall conditions are upheld via high L0 file count. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"}, + {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush", + "DBImpl::FlushMemTable:StallWaitDone"}, + {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // used for the delayable flushes + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); + } + ASSERT_OK(dbfull()->Flush(flush_opts)); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); + ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); + manual_compaction_thread.join(); + + // If CompactRange's flush was skipped, the final Put above will still be + // in the active memtable. 
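+  // (kNumEntriesActiveMemTable counts entries in the mutable memtable only,
+  // so a value of 1 below confirms the flush was skipped.)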
+ std::string num_keys_in_memtable; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); + ASSERT_EQ(std::to_string(1), num_keys_in_memtable); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { + // Verify memtable only gets flushed if it contains data overlapping the range + // provided to `CompactRange`. Tests all kinds of overlap/non-overlap. + const int kNumEndpointKeys = 5; + std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"}; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + + // One extra iteration for nullptr, which means left side of interval is + // unbounded. + for (int i = 0; i <= kNumEndpointKeys; ++i) { + Slice begin; + Slice* begin_ptr; + if (i == 0) { + begin_ptr = nullptr; + } else { + begin = keys[i - 1]; + begin_ptr = &begin; + } + // Start at `i` so right endpoint comes after left endpoint. One extra + // iteration for nullptr, which means right side of interval is unbounded. + for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) { + Slice end; + Slice* end_ptr; + if (j == kNumEndpointKeys) { + end_ptr = nullptr; + } else { + end = keys[j]; + end_ptr = &end; + } + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Put("d", "val")); + CompactRangeOptions compact_range_opts; + ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr)); + + uint64_t get_prop_tmp, num_memtable_entries = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + if (begin_ptr == nullptr || end_ptr == nullptr || + (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) { + // In this case `CompactRange`'s range overlapped in some way with the + // memtable's range, so flush should've happened. Then "b" and "d" won't + // be in the memtable. 
+        ASSERT_EQ(0, num_memtable_entries);
+      } else {
+        ASSERT_EQ(2, num_memtable_entries);
+        // flush anyway to prepare for the next iteration
+        ASSERT_OK(db_->Flush(FlushOptions()));
+      }
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  CompactionStatsCollector* collector = new CompactionStatsCollector();
+  options.listeners.emplace_back(collector);
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, SubcompactionEvent) {
+  class SubCompactionEventListener : public EventListener {
+   public:
+    void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+      InstrumentedMutexLock l(&mutex_);
+      ASSERT_EQ(running_compactions_.find(ci.job_id),
+                running_compactions_.end());
+      running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+    }
+
+    void OnCompactionCompleted(DB* /*db*/,
+                               const CompactionJobInfo& ci) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(ci.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      ASSERT_EQ(it->second.size(), 0);
+      running_compactions_.erase(it);
+    }
+
+    void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(si.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      auto r = it->second.insert(si.subcompaction_job_id);
+      ASSERT_TRUE(r.second);  // each subcompaction_job_id should be different
+      total_subcompaction_cnt_++;
+    }
+
+    void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(si.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      auto r = it->second.erase(si.subcompaction_job_id);
+      ASSERT_EQ(r, 1);
+    }
+
+    size_t GetRunningCompactionCount() {
+      InstrumentedMutexLock l(&mutex_);
+      return running_compactions_.size();
+    }
+
+    size_t GetTotalSubcompactionCount() {
+      InstrumentedMutexLock l(&mutex_);
+      return total_subcompaction_cnt_;
+    }
+
+   private:
+    InstrumentedMutex mutex_;
+    std::unordered_map<int, std::unordered_set<int>> running_compactions_;
+    size_t total_subcompaction_cnt_ = 0;
+  };
+
+  Options options = CurrentOptions();
+  options.target_file_size_base = 1024;
+  options.level0_file_num_compaction_trigger = 10;
+  auto* listener = new SubCompactionEventListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // generate 4 files @ L2
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+
+  // generate 2 files @ L1 which overlap with the L2 files
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(1);
+  ASSERT_EQ(FilesPerLevel(), "0,2,4");
+
+  CompactRangeOptions comp_opts;
+  comp_opts.max_subcompactions = 4;
+  Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
+  ASSERT_OK(s);
+
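+  // The manual full-range compaction above allows up to 4 subcompactions;
+  // each one must be inserted into and erased from the listener's per-job
+  // set exactly once.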
ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // make sure there's no running compaction
+  ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
+  // and that subcompactions were triggered
+  ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+  // LSM setup:
+  // L1:       [ba bz]
+  // L2: [a b]         [c d]
+  // L3: [a b]         [c d]
+  //
+  // Thread 1:                     Thread 2:
+  // Begin compacting all L2->L3
+  //                               Compact [ba bz] L1->L3
+  // End compacting all L2->L3
+  //
+  // The compaction operation in thread 2 should be disallowed because the
+  // range overlaps with the compaction in thread 1, which also covers that
+  // range in L3.
+  Options options = CurrentOptions();
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+  Reopen(options);
+
+  for (int level = 3; level >= 2; --level) {
+    ASSERT_OK(Put("a", "val"));
+    ASSERT_OK(Put("b", "val"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("c", "val"));
+    ASSERT_OK(Put("d", "val"));
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  ASSERT_OK(Put("ba", "val"));
+  ASSERT_OK(Put("bz", "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0",
+       "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+      {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+       "CompactFilesImpl:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto bg_thread = port::Thread([&]() {
+    // Thread 1
+    std::vector<std::string> filenames = collector->GetFlushedFiles();
+    filenames.pop_back();
+    ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+                                3 /* output_level */));
+  });
+
+  // Thread 2
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+  std::string filename = collector->GetFlushedFiles().back();
+  ASSERT_FALSE(
+      db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+          .ok());
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+  bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+  Options options = CurrentOptions();
+  SstStatsCollector* collector = new SstStatsCollector();
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(collector);
+  Reopen(options);
+
+  // Make sure the L0 files overlap to prevent trivial move.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(Put("b", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("a"));
+  ASSERT_OK(Delete("b"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Expect one file creation to start for each flush, and zero for compaction
+  // since no keys are written.
+  ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+  const int kNumKeysPerFile = 10;
+  const int kMaxBackgroundThreads = 64;
+
+  struct CompactionLimiter {
+    std::string name;
+    int limit_tasks;
+    int max_tasks;
+    int tasks;
+    std::shared_ptr<ConcurrentTaskLimiter> limiter;
+  };
+
+  std::vector<CompactionLimiter> limiter_settings;
+  limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+  for (auto& ls : limiter_settings) {
+    ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+  }
+
+  std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+      NewConcurrentTaskLimiter("unique_limiter", -1));
+
+  const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7",
+                            "8", "9", "a", "b", "c", "d", "e", "f"};
+  const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+  std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 * 1024;  // 110KB
+  options.arena_block_size = 4096;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 64;
+  options.level0_stop_writes_trigger = 64;
+  options.max_background_jobs = kMaxBackgroundThreads;  // Enough threads
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  options.max_write_buffer_number = 10;  // Enough memtables
+  DestroyAndReopen(options);
+
+  std::vector<Options> option_vector;
+  option_vector.reserve(cf_count);
+
+  for (unsigned int cf = 0; cf < cf_count; cf++) {
+    ColumnFamilyOptions cf_opt(options);
+    if (cf == 0) {
+      // The "default" CF doesn't use a compaction limiter
+      cf_opt.compaction_thread_limiter = nullptr;
+    } else if (cf == 1) {
+      // The "1" CF uses a bypassed compaction limiter
+      unique_limiter->SetMaxOutstandingTask(-1);
+      cf_opt.compaction_thread_limiter = unique_limiter;
+    } else {
+      // Assign a limiter by mod
+      auto& ls = limiter_settings[cf % 3];
+      cf_opt.compaction_thread_limiter = ls.limiter;
+      cf_to_limiter[cf_names[cf]] = &ls;
+    }
+    option_vector.emplace_back(DBOptions(options), cf_opt);
+  }
+
+  for (unsigned int cf = 1; cf < cf_count; cf++) {
+    CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+  }
+
+  ReopenWithColumnFamilies(
+      std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
+
+  port::Mutex mutex;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+          iter->second->max_tasks =
+              std::max(iter->second->max_tasks, iter->second->limit_tasks);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(--iter->second->tasks, 0);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Size the thread pools: 1/4 flush threads, the rest compaction threads.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4; + const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks; + env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH); + env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW); + + test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks]; + + // Block all compaction threads in thread pool. + for (size_t i = 0; i < kTotalCompactTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_compact_tasks[i], Env::LOW); + sleeping_compact_tasks[i].WaitUntilSleeping(); + } + + int keyIndex = 0; + + for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + } + + // Enough L0 files to trigger compaction + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_EQ(NumTableFilesAtLevel(0, cf), + options.level0_file_num_compaction_trigger); + } + + // Create more files for one column family, which triggers speed up + // condition, all compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(0, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(0, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 0)); + } + + // All CFs are pending compaction + ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW)); + + // Unblock all compaction threads + for (size_t i = 0; i < kTotalCompactTasks; i++) { + sleeping_compact_tasks[i].WakeUp(); + sleeping_compact_tasks[i].WaitUntilDone(); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Max outstanding compact tasks reached limit + for (auto& ls : limiter_settings) { + ASSERT_EQ(ls.limit_tasks, ls.max_tasks); + ASSERT_EQ(0, ls.limiter->GetOutstandingTask()); + } + + // test manual compaction under a fully throttled limiter + int cf_test = 1; + unique_limiter->SetMaxOutstandingTask(0); + + // flush one more file to cf 1 + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf_test, "", "")); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); + ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); + + Compact(cf_test, Key(0), Key(keyIndex)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, + ::testing::Values(std::make_tuple(1, true), + std::make_tuple(1, false), + std::make_tuple(4, true), + std::make_tuple(4, false))); + +TEST_P(DBCompactionDirectIOTest, DirectIO) { + Options options = CurrentOptions(); + Destroy(options); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.use_direct_io_for_flush_and_compaction = GetParam(); + options.env = MockEnv::Create(Env::Default()); + Reopen(options); + bool readahead = false; + SyncPoint::GetInstance()->SetCallBack( + 
"CompactionJob::OpenCompactionOutputFile", [&](void* arg) { + bool* use_direct_writes = static_cast(arg); + ASSERT_EQ(*use_direct_writes, + options.use_direct_io_for_flush_and_compaction); + }); + if (options.use_direct_io_for_flush_and_compaction) { + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; }); + } + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu"}, options); + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + Compact(1, "p", "q"); + ASSERT_EQ(readahead, options.use_direct_reads); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + Destroy(options); + delete options.env; +} + +INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest, + testing::Bool()); + +class CompactionPriTest : public DBTestBase, + public testing::WithParamInterface { + public: + CompactionPriTest() + : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) { + compaction_pri_ = GetParam(); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + uint32_t compaction_pri_; +}; + +TEST_P(CompactionPriTest, Test) { + Options options = CurrentOptions(); + options.write_buffer_size = 16 * 1024; + options.compaction_pri = static_cast(compaction_pri_); + options.hard_pending_compaction_bytes_limit = 256 * 1024; + options.max_bytes_for_level_base = 64 * 1024; + options.max_bytes_for_level_multiplier = 4; + options.compression = kNoCompression; + + DestroyAndReopen(options); + + Random rnd(301); + const int kNKeys = 5000; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); + + for (int i = 0; i < kNKeys; i++) { + ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < kNKeys; i++) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } +} + +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio, + CompactionPri::kRoundRobin)); + +TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) { + Options options = CurrentOptions(); + options.write_buffer_size = 16 * 1024; + options.max_bytes_for_level_base = 128 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 4; + options.compaction_pri = CompactionPri::kRoundRobin; + options.max_bytes_for_level_multiplier = 4; + options.num_levels = 3; + options.compression = kNoCompression; + + DestroyAndReopen(options); + + Random rnd(301); + + // 30 Files in L0 to trigger compactions between L1 and L2 + for (int i = 0; i < 30; i++) { + for (int j = 0; j < 16; j++) { + ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const std::vector compact_cursors = + storage_info->GetCompactCursors(); + + Reopen(options); + + VersionSet* const 
reopened_versions = dbfull()->GetVersionSet();
+  assert(reopened_versions);
+
+  ColumnFamilyData* const reopened_cfd =
+      reopened_versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(reopened_cfd, nullptr);
+
+  Version* const reopened_current = reopened_cfd->current();
+  ASSERT_NE(reopened_current, nullptr);
+
+  const VersionStorageInfo* const reopened_storage_info =
+      reopened_current->storage_info();
+  ASSERT_NE(reopened_storage_info, nullptr);
+
+  const std::vector<InternalKey> reopened_compact_cursors =
+      reopened_storage_info->GetCompactCursors();
+  const auto icmp = reopened_storage_info->InternalComparator();
+  ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size());
+  for (size_t i = 0; i < compact_cursors.size(); i++) {
+    if (compact_cursors[i].Valid()) {
+      ASSERT_EQ(0,
+                icmp->Compare(compact_cursors[i], reopened_compact_cursors[i]));
+    } else {
+      ASSERT_TRUE(!reopened_compact_cursors[i].Valid());
+    }
+  }
+}
+
+TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
+  const int kKeysPerBuffer = 100;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.max_bytes_for_level_multiplier = 2;
+  options.level0_file_num_compaction_trigger = 4;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  // Setup 7 threads but limited subcompactions so that
+  // RoundRobin requires extra compactions from reserved threads
+  options.max_subcompactions = 1;
+  options.max_background_compactions = 7;
+  options.max_compaction_bytes = 100000000;
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(7, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 15, 25};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure a nearly equivalent number of files
+        // in L2 overlap with the files selected to compact from L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+  // 15 files in L1; 25 files in L2
+
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed executed.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        if (grab_pressure_token_) {
+          // 7 files are selected for round-robin under auto
+          // compaction. The number of planned subcompactions is restricted by
+          // the limited number of max_background_compactions
+          ASSERT_EQ(num_planned_subcompactions, 7);
+        } else {
+          ASSERT_EQ(num_planned_subcompactions, 1);
+        }
+        num_planned_subcompactions_verified = true;
+      });
+
+  // The following 3 dependencies have to be added to ensure the auto
+  // compaction and the pressure token are correctly enabled. Same for
+  // RoundRobinSubcompactionsUsingResources and
+  // DBCompactionTest.RoundRobinSubcompactionsShrinkResources
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RoundRobinSubcompactionsAgainstPressureToken:0",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::AcquireSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstPressureToken:1"},
+       {"RoundRobinSubcompactionsAgainstPressureToken:2",
+        "CompactionJob::AcquireSubcompactionResources:1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
+  std::unique_ptr<WriteControllerToken> pressure_token;
+  if (grab_pressure_token_) {
+    pressure_token =
+        dbfull()->TEST_write_controler().GetCompactionPressureToken();
+  }
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
+                        RoundRobinSubcompactionsAgainstPressureToken,
+                        testing::Bool());
+
+TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+  const int kKeysPerBuffer = 200;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = 1;
+  options.max_background_compactions = max_compaction_limits_;
+  // Set a large number for max_compaction_bytes so that one round-robin
+  // compaction is enough to make post-compaction L1 size less than
+  // the maximum size (this test assumes only one round-robin compaction
+  // is triggered by kLevelMaxLevelSize)
+  options.max_compaction_bytes = 100000000;
+
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 40, 100};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure a nearly equivalent number of files
+        // in L2 overlap with the files selected to compact from L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+
+  // 40 files in L1; 100 files in L2
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed executed.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        // More than 10 files are selected for round-robin under auto
+        // compaction.
The number of planned subcompactions is restricted by
+        // the minimum of the available threads and the compaction limits
+        ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
+                  std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+        num_planned_subcompactions_verified = true;
+      });
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RoundRobinSubcompactionsAgainstResources:0",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::AcquireSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstResources:1"},
+       {"RoundRobinSubcompactionsAgainstResources:2",
+        "CompactionJob::AcquireSubcompactionResources:1"},
+       {"CompactionJob::ReleaseSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstResources:3"},
+       {"RoundRobinSubcompactionsAgainstResources:4",
+        "CompactionJob::ReleaseSubcompactionResources:1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
+  auto pressure_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
+  // We can reserve more threads now except one is being used
+  ASSERT_EQ(total_low_pri_threads_ - 1,
+            env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
+  ASSERT_EQ(
+      total_low_pri_threads_ - 1,
+      env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
+                        RoundRobinSubcompactionsAgainstResources,
+                        ::testing::Values(std::make_tuple(1, 5),
+                                          std::make_tuple(5, 1),
+                                          std::make_tuple(10, 5),
+                                          std::make_tuple(5, 10),
+                                          std::make_tuple(10, 10)));
+
+TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) {
+  const int kKeysPerBuffer = 200;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = max_subcompactions_;
+  options.max_background_compactions = 1;
+  options.max_compaction_bytes = 100000000;
+  // Similar experiment setting as above except the max_subcompactions
+  // is given by max_subcompactions_ (1 or 4), and we fix the
+  // additional resources as (1, 1) and thus no more extra resources
+  // can be used
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(1, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 33, 100};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure a nearly equivalent number of files
+        // in L2 overlap with the files selected to compact from L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
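+    // Auto compactions are disabled at this point, so this wait only drains
+    // the background work generated by the file moves above.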
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+
+  // 33 files in L1; 100 files in L2
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed executed.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        // At most 4 files are selected for round-robin under auto
+        // compaction. The number of planned subcompactions is restricted by
+        // max_subcompactions since no extra resources can be used
+        ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions);
+        num_planned_subcompactions_verified = true;
+      });
+  // No need to setup dependency for pressure token since
+  // AcquireSubcompactionResources may not be called and it anyway cannot
+  // reserve any additional resources
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0");
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 4 * 1024;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_bytes_for_level_multiplier = 4;
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+
+  DestroyAndReopen(options);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  VersionStorageInfo* storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue);
+  storage_info->AddCursorForOneLevel(2, split_cursor);
+
+  Random rnd(301);
+
+  for (int i = 0; i < 50; i++) {
+    for (int j = 0; j < 50; j++) {
+      ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102)));
+    }
+  }
+  // Add more overlapping files (avoid trivial move) to trigger compaction that
+  // output files in L2. Note that trivial move does not trigger compaction and
+  // in that case the cursor is not necessarily the boundary of a file.
+  for (int i = 0; i < 50; i++) {
+    for (int j = 0; j < 50; j++) {
+      ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014)));
+    }
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  const auto icmp = cfd->current()->storage_info()->InternalComparator();
+  // Files in level 2 should be split by the cursor
+  for (const auto& file : level_to_files[2]) {
+    ASSERT_TRUE(
+        icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 ||
+        icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0);
+  }
+}
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+  NoopMergeOperator() {}
+
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* merge_out) const override {
+    std::string val("bar");
+    merge_out->new_value = val;
+    return true;
+  }
+
+  const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 10240;
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(Merge("foo", rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  MoveFilesToLevel(2);
+
+  std::string prop;
+  EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+  uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+  // Regression test for bug where manual compaction hangs forever when the DB
+  // is in read-only mode. Verify it now at least returns, despite failing.
+  const int kNumL0Files = 4;
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.env = mock_env.get();
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure files are overlapping in key-range to prevent trivial move.
+    ASSERT_OK(Put("key1", rnd.RandomString(1024)));
+    ASSERT_OK(Put("key2", rnd.RandomString(1024)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+  // Enter read-only mode by failing a write.
+  mock_env->SetFilesystemActive(false);
+  // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+  // early trying to flush memtable.
+  ASSERT_NOK(Put("key3", rnd.RandomString(1024)));
+
+  // In the bug scenario, the first manual compaction would fail and forget to
+  // unregister itself, causing the second one to hang forever due to conflict
+  // with a non-running compaction.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  Slice begin_key("key1");
+  Slice end_key("key2");
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+  // Close before mock_env destruct.
+  Close();
+}
+
+// ManualCompactionBottomLevelOptimization tests the bottom level manual
+// compaction optimization to skip recompacting files created by Ln-1 to Ln
+// compaction
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 5;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 1024;
+  opts.max_bytes_for_level_multiplier = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  MoveFilesToLevel(2);
+
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  const std::vector<InternalStats::CompactionStats>& comp_stats =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  int num = comp_stats[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  num = comp_stats2[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+}
+
+TEST_F(DBCompactionTest, ManualCompactionMax) {
+  uint64_t l1_avg_size = 0, l2_avg_size = 0;
+  auto generate_sst_func = [&]() {
+    Random rnd(301);
+    for (auto i = 0; i < 100; i++) {
+      for (auto j = 0; j < 10; j++) {
+        ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+
+    for (auto i = 0; i < 10; i++) {
+      for (auto j = 0; j < 10; j++) {
+        ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+
+    std::vector<std::vector<FileMetaData>> level_to_files;
+    dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                    &level_to_files);
+
+    uint64_t total = 0;
+    for (const auto& file : level_to_files[1]) {
+      total += file.compensated_file_size;
+    }
+    l1_avg_size = total / level_to_files[1].size();
+
+    total = 0;
+    for (const auto& file : level_to_files[2]) {
+      total += file.compensated_file_size;
+    }
+    l2_avg_size = total / level_to_files[2].size();
+  };
+
+  std::atomic_int num_compactions(0);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+
+  // with default setting (1.6G by default), it should cover all files in 1
+  // compaction
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  num_compactions.store(0);
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == 1);
+
+  // split the compaction to 5
+  int num_split = 5;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+  opts.max_compaction_bytes = total_size / num_split;
+  opts.target_file_size_base = total_size / num_split;
+  Reopen(opts);
+
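+  // With max_compaction_bytes and target_file_size_base capped at a fifth of
+  // the total data size, the single CompactRange() below is expected to be
+  // carved into num_split compactions.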
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == num_split);
+
+  // very small max_compaction_bytes, it should still move forward
+  opts.max_compaction_bytes = l1_avg_size / 2;
+  opts.target_file_size_base = l1_avg_size / 2;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() > 10);
+
+  // dynamically set the option
+  num_split = 2;
+  opts.max_compaction_bytes = 0;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+  Status s = db_->SetOptions(
+      {{"max_compaction_bytes", std::to_string(total_size / num_split)},
+       {"target_file_size_base", std::to_string(total_size / num_split)}});
+  ASSERT_OK(s);
+
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == num_split);
+}
+
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+  Options opts = CurrentOptions();
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 2; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+      [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.ok() || s.IsShutdownInProgress());
+  ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+  const int kNumKeysPerFile = 100;
+  // Generate SST files.
+  Options options = CurrentOptions();
+
+  // Generate an external SST file containing a single key, i.e.
99 + std::string sst_files_dir = dbname_ + "/sst_files/"; + ASSERT_OK(DestroyDir(env_, sst_files_dir)); + ASSERT_OK(env_->CreateDir(sst_files_dir)); + SstFileWriter sst_writer(EnvOptions(), options); + const std::string sst_file_path = sst_files_dir + "test.sst"; + ASSERT_OK(sst_writer.Open(sst_file_path)); + ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value")); + ASSERT_OK(sst_writer.Finish()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "BackgroundCallCompaction:0"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.write_buffer_size = 110 << 10; // 110KB + options.level0_file_num_compaction_trigger = + options.level0_stop_writes_trigger; + options.max_subcompactions = max_subcompactions_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + Random rnd(301); + + // Generate level0_stop_writes_trigger L0 files to trigger write stop + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + for (int j = 0; j != kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(990))); + } + if (i > 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i); + } + } + // When we reach this point, there will be level0_stop_writes_trigger L0 + // files and one extra key (99) in memory, which overlaps with the external + // SST file. Write stall triggers, and can be cleared only after compaction + // reduces the number of L0 files. + + // Compaction will also be triggered since we have reached the threshold for + // auto compaction. Note that compaction may begin after the following file + // ingestion thread and waits for ingestion to finish. + + // Thread to ingest file with overlapping key range with the current + // memtable. Consequently ingestion will trigger a flush. The flush MUST + // proceed without waiting for the write stall condition to clear, otherwise + // deadlock can happen. 
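+  // If the historical deadlock regressed, the flush triggered by the
+  // ingestion below would wait on the write stall forever and the join()
+  // that follows would hang; completing the join is the actual regression
+  // check.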
+  port::Thread ingestion_thr([&]() {
+    IngestExternalFileOptions ifo;
+    Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+    ASSERT_OK(s);
+  });
+
+  // More write to trigger write stop
+  ingestion_thr.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Close();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistency0", [&](void* arg) {
+        auto p =
+            reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // just swap the two FileMetaData so that we hit error
+        // in CheckConsistency function
+        FileMetaData* temp = *(p->first);
+        *(p->first) = *(p->second);
+        *(p->second) = temp;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < 2; ++k) {
+    ASSERT_OK(Put("foo", "bar"));
+    Status s = Flush();
+    if (k < 1) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_TRUE(s.IsCorruption());
+    }
+  }
+
+  ASSERT_NOK(Put("foo", "bar"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest2) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.target_file_size_base = 1000;
+  options.level0_file_num_compaction_trigger = 2;
+  BlockBasedTableOptions bbto;
+  bbto.block_size = 400;  // small block size
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistency1", [&](void* arg) {
+        auto p =
+            reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // just swap the two FileMetaData so that we hit error
+        // in CheckConsistency function
+        FileMetaData* temp = *(p->first);
+        *(p->first) = *(p->second);
+        *(p->second) = temp;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::string value = rnd.RandomString(1000);
+
+  ASSERT_OK(Put("foo1", value));
+  ASSERT_OK(Put("z", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo2", value));
+  ASSERT_OK(Put("z", ""));
+  Status s = Flush();
+  ASSERT_TRUE(s.ok() || s.IsCorruption());
+
+  // This probably returns non-OK, but we rely on the next Put()
+  // to determine the DB is frozen.
+  ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+  ASSERT_NOK(Put("foo", "bar"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+                       const std::string& value, const Options& options) {
+  ExternalSstFileInfo info;
+  std::string f = test::PerThreadDBPath("sst_file" + key);
+  EnvOptions env;
+  ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+  auto s = writer.Open(f);
+  ASSERT_OK(s);
+  ASSERT_OK(writer.Put(key, value));
+
+  ASSERT_OK(writer.Finish(&info));
+  IngestExternalFileOptions ingest_opt;
+
+  ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+       FlushAfterIntraL0CompactionCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::atomic<int> pick_intra_l0_count(0);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin ingesting files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTestWithParam::"
+        "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // prevents trivial move
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 sst.
+  for (int i = 0; i < 5; ++i) {
+    ASSERT_OK(Put(Key(i + 1), value));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest log sequence number in this memtable is
+  // less than that of the ssts ingested in the next step.
+  ASSERT_OK(Put(Key(0), "a"));
+
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+  TEST_SYNC_POINT(
+      "DBCompactionTestWithParam::"
+      "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready");
+
+  // Ingest 5 L0 ssts. These files will trigger PickIntraL0Compaction.
+  for (int i = 5; i < 10; i++) {
+    ASSERT_EQ(i, NumTableFilesAtLevel(0));
+    IngestOneKeyValue(dbfull(), Key(i), value, options);
+  }
+
+  // Put one key so that the biggest log sequence number in this memtable is
+  // bigger than that of the ssts ingested above.
+  ASSERT_OK(Put(Key(2), "b"));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GT(level_to_files[0].size(), 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+  ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+       IntraL0CompactionAfterFlushCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  options.write_buffer_size = 2 << 20;
+  options.max_write_buffer_number = 6;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+  std::string value2(rnd.RandomString(kValueSize));
+  std::string bigvalue = value + value;
+
+  // prevents trivial move
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  std::atomic<int> pick_intra_l0_count(0);
+  // The L0->L1 must be picked before we begin ingesting files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTestWithParam::"
+        "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 ssts.
+  for (int i = 0; i < 6; ++i) {
+    if (i % 2 == 0) {
+      IngestOneKeyValue(dbfull(), Key(i), value, options);
+    } else {
+      ASSERT_OK(Put(Key(i), value));
+      ASSERT_OK(Flush());
+    }
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Block the flush job by keeping the only HIGH-priority thread asleep
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_tasks;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+                 Env::Priority::HIGH);
+  sleeping_tasks.WaitUntilSleeping();
+
+  // Put many keys to make memtable request to flush
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_OK(Put(Key(i), bigvalue));
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+  TEST_SYNC_POINT(
+      "DBCompactionTestWithParam::"
+      "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready");
+  // ingest file to trigger IntraL0Compaction
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(i, NumTableFilesAtLevel(0));
+    IngestOneKeyValue(dbfull(), Key(i), value2, options);
+  }
+
+  // Wake up flush job
+  sleeping_tasks.WakeUp();
+  sleeping_tasks.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  uint64_t error_count = 0;
+  db_->GetIntProperty("rocksdb.background-errors", &error_count);
+  ASSERT_EQ(error_count, 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_EQ(bigvalue, Get(Key(i)));
+  }
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(value2, Get(Key(i)));
+  }
+}
+
+TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
+  constexpr int kSstNum = 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Generate some sst files on level 0 with sequence keys (no overlap)
+  for (int i = 0; i < kSstNum; i++) {
+    for (int j = 1; j < UCHAR_MAX; j++) {
+      auto key = std::string(kSstNum, '\0');
+      key[kSstNum - i] += static_cast<char>(j);
+      ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0));
+
+  auto cro = CompactRangeOptions();
+  cro.bottommost_level_compaction = bottommost_level_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
+      bottommost_level_compaction_ ==
+          BottommostLevelCompaction::kForceOptimized) {
+    // Real compaction to compact all sst files from level 0 to 1 file on level
+    // 1
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+  } else {
+    // Just trivial move from level 0 -> 1
+    ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
+    ::testing::Values(BottommostLevelCompaction::kSkip,
+                      BottommostLevelCompaction::kIfHaveCompactionFilter,
+                      BottommostLevelCompaction::kForce,
+                      BottommostLevelCompaction::kForceOptimized));
+
+TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  bool has_compaction = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 10);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
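+  // First confirm that the value configured at open time is visible through
+  // GetDBOptions() before exercising the dynamic update below.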
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10);
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+
+  has_compaction = false;
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 2);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+}
+
+TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  bool has_compaction = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 10);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+  has_compaction = false;
+
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 2);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+}
+
+TEST_P(ChangeLevelConflictsWithAuto, TestConflict) {
+  // A `CompactRange()` may race with an automatic compaction, so we need
+  // to make sure it doesn't corrupt the data.
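+  //
+  // For reference, the refitting CompactRange() exercised here is issued as
+  // (sketch):
+  //   CompactRangeOptions cro;
+  //   cro.change_level = true;  // refit output files to cro.target_level
+  //   cro.target_level = 1;
+  //   db->CompactRange(cro, nullptr, nullptr);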
+ Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // Run a qury to refitting to level 1 while another thread writing to + // the same level. + SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::CompactRange:BeforeRefit:1", + "AutoCompactionFinished1", + }, + { + "AutoCompactionFinished2", + "DBImpl::CompactRange:BeforeRefit:2", + }, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread auto_comp([&] { + TEST_SYNC_POINT("AutoCompactionFinished1"); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bar", "v3")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + TEST_SYNC_POINT("AutoCompactionFinished2"); + }); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = GetParam() ? 1 : 0; + // This should return non-OK, but it's more important for the test to + // make sure that the DB is not corrupted. + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + auto_comp.join(); + // Refitting didn't happen. + SyncPoint::GetInstance()->DisableProcessing(); + + // Write something to DB just make sure that consistency check didn't + // fail and make the DB readable. +} + +INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto, + ChangeLevelConflictsWithAuto, testing::Bool()); + +TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) { + // A `CompactRange()` with `change_level == true` needs to execute its final + // step, `ReFitLevel()`, in isolation. Previously there was a bug where + // refitting could target the same level as an ongoing manual compaction, + // leading to overlapping files in that level. + // + // This test ensures that case is not possible by verifying any manual + // compaction issued during the `ReFitLevel()` phase fails with + // `Status::Incomplete`. + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with three levels populated. + Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the + // foreground thread will try to simultaneously compact L0->L1. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. 
+ { + "DBImpl::RunManualCompaction()::1", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "PutFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "FlushedFG", + "DBImpl::RunManualCompaction()::2", + }, + // The next two dependencies ensure the foreground invokes + // `CompactRange()` while the background is refitting. The + // foreground's `CompactRange()` is guaranteed to attempt an L0->L1 + // as we set it up with an empty memtable and a new L0 file. + { + "DBImpl::CompactRange:PreRefitLevel", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG", + "DBImpl::CompactRange:PostRefitLevel", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG"); + // Make sure we have something new to compact in the foreground. + // Note key 1 is carefully chosen as it ensures the file we create here + // overlaps with one of the files being refitted L2->L1 in the background. + // If we chose key 0, the file created here would not overlap. + ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG"); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG"); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG"); + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { + // This test is added to ensure that RefitLevel() error paths are clearing + // internal flags and to test that subsequent valid RefitLevel() calls + // succeeds + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + ASSERT_EQ("", FilesPerLevel(0)); + + // Setup an LSM with three levels populated. 
+ Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1", FilesPerLevel(0)); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + auto start_idx = key_idx; + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + auto end_idx = key_idx - 1; + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // Next two CompactRange() calls are used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + + // Trigger a refit to L1 first + { + std::string begin_string = Key(start_idx); + std::string end_string = Key(end_idx); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a refit from L2->L1 - this should fail and exercise error paths in + // RefitLevel() + { + // Select key range that matches the bottom most level (L2) + std::string begin_string = Key(0); + std::string end_string = Key(start_idx - 1); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a valid Refit request to ensure, the path is still working + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,5", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.front(); + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), 
second_key);
+  ASSERT_EQ(table_file->oldest_blob_file_number,
+            blob_file->GetBlobFileNumber());
+
+  ASSERT_EQ(blob_file->GetTotalBlobCount(), 2);
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  ASSERT_NE(internal_stats, nullptr);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize());
+  ASSERT_EQ(compaction_stats[1].bytes_written_blob,
+            blob_file->GetTotalBlobBytes());
+  ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+  ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1);
+}
+
+class DBCompactionTestBlobError
+    : public DBCompactionTest,
+      public testing::WithParamInterface<std::string> {
+ public:
+  DBCompactionTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBCompactionTestBlobError, CompactionError) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char second_key[] = "second_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_value[] = "second_value";
+  constexpr char third_value[] = "third_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, first_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, second_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, third_value));
+  ASSERT_OK(Put(second_key, third_value));
+  ASSERT_OK(Flush());
+
+  options.enable_blob_files = true;
+
+  Reopen(options);
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l1_files = storage_info->LevelFiles(1);
+  ASSERT_TRUE(l1_files.empty());
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_TRUE(blob_files.empty());
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  ASSERT_NE(internal_stats, nullptr);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+  } else {
+    // SST file
writing succeeded; blob file writing failed (during Finish)
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_GT(compaction_stats[1].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+    ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+  }
+}
+
+class DBCompactionTestBlobGC
+    : public DBCompactionTest,
+      public testing::WithParamInterface<std::tuple<double, bool>> {
+ public:
+  DBCompactionTestBlobGC()
+      : blob_gc_age_cutoff_(std::get<0>(GetParam())),
+        updated_enable_blob_files_(std::get<1>(GetParam())) {}
+
+  double blob_gc_age_cutoff_;
+  bool updated_enable_blob_files_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC,
+                        ::testing::Combine(::testing::Values(0.0, 0.5, 1.0),
+                                           ::testing::Bool()));
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // one blob per file
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0;
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 128; i += 2) {
+    ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(
+        Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1)));
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+  ASSERT_EQ(original_blob_files.size(), 128);
+
+  // Note: turning off enable_blob_files before the compaction results in
+  // garbage collected values getting inlined.
+  ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+  CompactRangeOptions cro;
+  cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce;
+  cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Check that the GC stats are correct
+  {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+    ASSERT_GE(compaction_stats.size(), 2);
+
+    ASSERT_GE(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+  }
+
+  const size_t cutoff_index = static_cast<size_t>(
+      cro.blob_garbage_collection_age_cutoff * original_blob_files.size());
+  const size_t expected_num_files = original_blob_files.size() - cutoff_index;
+
+  const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(new_blob_files.size(), expected_num_files);
+
+  // Original blob files below the cutoff should be gone, original blob files
+  // at or above the cutoff should still be there
+  for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+    ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+  }
+
+  for (size_t i = 0; i < 128; ++i) {
+    ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i));
+  }
+}
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // one blob per file
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff =
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+
+  ASSERT_OK(Put(third_key, third_value));
+  ASSERT_OK(Put(fourth_key, fourth_value));
+  ASSERT_OK(Flush());
+
+  const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(original_blob_files.size(), 4);
+
+  const size_t cutoff_index = static_cast<size_t>(
+      options.blob_garbage_collection_age_cutoff * original_blob_files.size());
+
+  // Note: turning off enable_blob_files before the compaction results in
+  // garbage collected values getting inlined.
+  size_t expected_number_of_files = original_blob_files.size();
+
+  if (!updated_enable_blob_files_) {
+    ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+    expected_number_of_files -= cutoff_index;
+  }
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  ASSERT_EQ(Get(first_key), first_value);
+  ASSERT_EQ(Get(second_key), second_value);
+  ASSERT_EQ(Get(third_key), third_value);
+  ASSERT_EQ(Get(fourth_key), fourth_value);
+
+  const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+
+  // Original blob files below the cutoff should be gone; original blob files
+  // at or above the cutoff should still be there
+  for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+    ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+  }
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  assert(versions->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  if (blob_gc_age_cutoff_ > 0.0) {
+    ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+
+    if (updated_enable_blob_files_) {
+      // GC relocated some blobs to new blob files
+      ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+      ASSERT_EQ(compaction_stats[1].bytes_read_blob,
+                compaction_stats[1].bytes_written_blob);
+    } else {
+      // GC moved some blobs back to the LSM, no new blob files
+      ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    }
+  } else {
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+  }
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
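+  // Note on the failure mode exercised below: the sync point callback strips
+  // the first byte of the serialized blob index, so when garbage collection
+  // tries to decode the index during CompactRange, decoding fails and the
+  // compaction is expected to surface Status::Corruption.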
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+  ASSERT_OK(Put(fourth_key, fourth_value));
+
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+      [](void* arg) {
+        Slice* const blob_index = static_cast<Slice*>(arg);
+        assert(blob_index);
+        assert(!blob_index->empty());
+        blob_index->remove_prefix(1);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) {
+  constexpr uint64_t min_blob_size = 10;
+
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char blob[] = "short";
+  static_assert(sizeof(blob) - 1 < min_blob_size,
+                "Blob too long to be inlined");
+
+  // Fake an inlined TTL blob index.
+  std::string blob_index;
+
+  constexpr uint64_t expiration = 1234567890;
+
+  BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+  WriteBatch batch;
+  ASSERT_OK(
+      WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+
+  // Fake a blob index referencing a non-existent blob file.
+  std::string blob_index;
+
+  constexpr uint64_t blob_file_number = 1000;
+  constexpr uint64_t offset = 1234;
+  constexpr uint64_t size = 5678;
+
+  BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                        kNoCompression);
+
+  WriteBatch batch;
+  ASSERT_OK(
+      WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kTableFile);
+  Status s;
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // The hash does not match, so the compaction write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  Status s;
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // The option is not set, so the checksum handoff will not be triggered
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
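+  // (Here kNoChecksum presumably makes the fault-injection file system report
+  // that it has no verification function, so the handed-off checksum is
+  // dropped rather than checked and all of the writes below are expected to
+  // succeed.)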
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // The option is not set, so the checksum handoff will not be triggered
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  Status s;
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // The hash does not match, so the compaction write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  Status s;
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, FIFOWarm) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  CompactionOptionsFIFO fifo_options;
+  fifo_options.age_for_warm = 1000;
+  fifo_options.max_table_files_size = 100000000;
+  options.compaction_options_fifo = fifo_options;
+  env_->SetMockSleep();
+  Reopen(options);
+
+  int total_warm = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+        Temperature temperature = *(static_cast<Temperature*>(arg));
+        if (temperature == Temperature::kWarm) {
+          total_warm++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(4, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature);
+  ASSERT_EQ(2, total_warm);
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
+  const int kNumL0Files = 10;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Generate two levels of files to make sure the manual compaction is not
+  // skipped
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(1);
+
+  // Block the compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
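+  // The sleeping task below occupies the (single) LOW-priority background
+  // thread, so both manual compactions stay queued; DisableManualCompaction()
+  // should then cancel them while they are still waiting to run.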
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  port::Thread compact_thread1([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(0);
+    std::string end_str = Key(3);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  port::Thread compact_thread2([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(4);
+    std::string end_str = Key(7);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  // Disabling manual compaction should cancel both manual compactions, and
+  // both should return Incomplete.
+  db_->DisableManualCompaction();
+
+  compact_thread1.join();
+  compact_thread2.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Generate files, but avoid triggering auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  // Make sure the manual compaction has been started in the background but
+  // has not yet set its status to in_progress, then cancel it; this should
+  // not result in a segfault
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction",
+        "DBCompactionTest::DisableJustStartedManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableJustStartedManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:InProgress",
+        "DBCompactionTest::DisableInProgressManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "CompactionJob::Run():Start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Generate files, but avoid triggering auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableInProgressManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
+  const int kNumL0Files = 4;
+
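+  // The dependency below delays DisableManualCompaction() until the manual
+  // compaction has been scheduled, so the cancellation hits a request that is
+  // queued behind the sleeping task rather than one that is already running.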
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  // Try to close the DB while the canceled manual compaction is still in the
+  // queue,
+  // along with an auto-triggered compaction.
+  auto s = db_->Close();
+  ASSERT_OK(s);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  // Close the DB with a manual compaction and an auto-triggered compaction in
+  // the queue.
+  auto s = db_->Close();
+  ASSERT_OK(s);
+
+  // The manual compaction thread should return with Incomplete().
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest,
+       DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
+  // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
+  // for automatic compactions to drain before starting the manual compaction.
+  // This test verifies `DisableManualCompaction()` can cancel such a
+  // compaction without waiting for the drain to complete.
+  const int kNumL0Files = 4;
+
+  // Enforce that the manual compaction enters the wait loop due to a pending
+  // automatic compaction.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"},
+       {"DBImpl::RunManualCompaction:WaitScheduled",
+        "BackgroundCallCompaction:0"}});
+  // The automatic compaction will cancel the waiting manual compaction.
+  // Completing this implies the cancellation did not wait on automatic
+  // compactions to finish.
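+  // callback_completed is set from inside the automatic compaction's
+  // callback; if DisableManualCompaction() blocked until the compaction queue
+  // drained, the callback could never return and the flag would stay false.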
+  bool callback_completed = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+        db_->DisableManualCompaction();
+        callback_completed = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(callback_completed);
+}
+
+TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  Reopen(options);
+
+  // Set up an LSM with L2 populated.
+  Random rnd(301);
+  ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+  ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // The background thread will refit L2->L1 while the foreground thread will
+  // attempt to run a compaction on new data. The following dependencies
+  // ensure the background manual compaction's refitting phase disables manual
+  // compaction immediately before the foreground manual compaction can
+  // register itself. Manual compaction is kept disabled until the foreground
+  // manual compaction checks for the failure once.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      // Only do Put()s for the foreground CompactRange() once the background
+      // CompactRange() has reached the refitting phase.
+      {
+          "DBImpl::CompactRange:BeforeRefit:1",
+          "DBCompactionTest::ChangeLevelConflictsWithManual:"
+          "PreForegroundCompactRange",
+      },
+      // Right before we register the manual compaction, proceed with
+      // the refitting phase so manual compactions are disabled. Stay in
+      // the refitting phase with manual compactions disabled until it is
+      // noticed.
+      {
+          "DBImpl::RunManualCompaction:0",
+          "DBImpl::CompactRange:BeforeRefit:2",
+      },
+      {
+          "DBImpl::CompactRange:PreRefitLevel",
+          "DBImpl::RunManualCompaction:1",
+      },
+      {
+          "DBImpl::RunManualCompaction:PausedAtStart",
+          "DBImpl::CompactRange:PostRefitLevel",
+      },
+      // If a compaction were somehow scheduled, let it run after re-enabling
+      // manual compactions. This dependency is not expected to be hit, but it
+      // is here to speculatively coerce out future bugs.
+      {
+          "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled",
+          "BackgroundCallCompaction:0",
+      },
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelConflictsWithManual:"
+      "PreForegroundCompactRange");
+  ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+  ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsIncomplete());
+
+  refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
+  // Flushes several files to trigger compaction while the lock is released
+  // during a bottom-pri compaction. Verifies it does not get scheduled to the
+  // thread pool because the per-DB limit for compaction parallelism is one
+  // (the default).
+  const int kNumL0Files = 4;
+  const int kNumLevels = 3;
+
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // Set up the last level to be non-empty since it's a bit unclear whether
+  // compaction to an empty level would be considered "bottommost".
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(kNumLevels - 1);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkBottomCompaction",
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PreTriggerCompaction"},
+       {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PostTriggerCompaction",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_range_thread([&] {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  });
+
+  // Sleep in the low-pri thread so any newly scheduled compaction will be
+  // queued. Otherwise it might finish before we check its existence.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+      "PreTriggerCompaction");
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  TEST_SYNC_POINT(
+      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+      "PostTriggerCompaction");
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  compact_range_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
+  // allow_ingest_behind prevents seqnum zeroing, and could cause a
+  // compaction loop with reason kBottommostFiles.
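+  // Presumably, without seqnum zeroing a bottommost-files compaction would
+  // leave the file's sequence numbers unchanged, so the same file could be
+  // picked again right away; hence the test expects no compaction to be
+  // picked at all.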
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.compaction_style = kCompactionStyleLevel;
+  options.allow_ingest_behind = true;
+  options.comparator = BytewiseComparator();
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
+  ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
+
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
+  ASSERT_OK(Flush());
+  auto snapshot = db_->GetSnapshot();
+  // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
+  db_->ReleaseSnapshot(snapshot);
+  bool compacted = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
+        // There should not be a compaction.
+        compacted = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Wait for compaction to be scheduled.
+  env_->SleepForMicroseconds(2000000);
+  ASSERT_FALSE(compacted);
+  // The following assert can be used to check for a compaction loop: it used
+  // to wait forever before the fix.
+  // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..17fa67cb2
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,507 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release build, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+  DBTestDynamicLevel()
+      : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+  // Use InMemoryEnv, or it would be too slow.
+  std::unique_ptr<Env> env(NewMemEnv(env_));
+
+  const int kNKeys = 1000;
+  int keys[kNKeys];
+
+  auto verify_func = [&]() {
+    for (int i = 0; i < kNKeys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+      if (i < kNKeys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      }
+    }
+  };
+
+  Random rnd(301);
+  for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+    for (int i = 0; i < kNKeys; i++) {
+      keys[i] = i;
+    }
+    if (ordered_insert == 0) {
+      RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+    }
+    for (int max_background_compactions = 1; max_background_compactions < 4;
+         max_background_compactions += 2) {
+      Options options;
+      options.env = env.get();
+      options.create_if_missing = true;
+      options.write_buffer_size = 2048;
+      options.max_write_buffer_number = 2;
+      options.level0_file_num_compaction_trigger = 2;
+      options.level0_slowdown_writes_trigger = 2;
+      options.level0_stop_writes_trigger = 2;
+      options.target_file_size_base = 2048;
+      options.level_compaction_dynamic_level_bytes = true;
+      options.max_bytes_for_level_base = 10240;
+      options.max_bytes_for_level_multiplier = 4;
+      options.max_background_compactions = max_background_compactions;
+      options.num_levels = 5;
+
+      options.compression_per_level.resize(3);
+      options.compression_per_level[0] = kNoCompression;
+      options.compression_per_level[1] = kLZ4Compression;
+      options.compression_per_level[2] = kSnappyCompression;
+      options.env = env_;
+
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNKeys; i++) {
+        int key = keys[i];
+        ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102)));
+        ASSERT_OK(Put(Key(key), rnd.RandomString(102)));
+        ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102)));
+        ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+        env_->SleepForMicroseconds(5000);
+      }
+
+      uint64_t int_prop;
+      ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+      ASSERT_EQ(0U, int_prop);
+
+      // Verify DB
+      for (int j = 0; j < 2; j++) {
+        verify_func();
+        if (j == 0) {
+          Reopen(options);
+        }
+      }
+
+      // Test that compact range works
+      ASSERT_OK(
+          dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      // All data should be in the last level.
+      ColumnFamilyMetaData cf_meta;
+      db_->GetColumnFamilyMetaData(&cf_meta);
+      ASSERT_EQ(5U, cf_meta.levels.size());
+      for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+      }
+      ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+      verify_func();
+
+      Close();
+    }
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 9102;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.max_compaction_bytes = 0;  // Force not expanding in compactions
+  options.db_host_id = "";  // Setting this messes up the file size calculation
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initially the base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 28K to L0
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Insert about 28K more to L0. After it is compacted to L4, the base
+  // level should be changed to L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  // Write even more data while leaving the base level at L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 40K more
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+
+  // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+  // level to 2.
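+  // Rough arithmetic behind the expected base level: with ~650K of data,
+  // max_bytes_for_level_base = 40960, and multiplier 4, the last level holds
+  // ~650K, the level above it ~160K, and the next one ~40K (about
+  // max_bytes_for_level_base), so three levels are needed and the base level
+  // is expected to settle at L2.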
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 650K more.
+  // Each file is about 11KB, with 9KB of data.
+  for (int i = 0; i < 1300; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+
+  // Make sure that the compaction starts before the last bit of data is
+  // flushed, so that the base level isn't raised to L1.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(2U, int_prop);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Write more data until the base level changes to L1. There will be
+  // a manual compaction going on at the same time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+      {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+      {"DynamicLevelMaxBytesBase2:compact_range_finish",
+       "FlushJob::WriteLevel0Table"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread([this] {
+    TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+  });
+
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+  for (int i = 0; i < 2; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+  ASSERT_OK(Flush());
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  const int kNumLevels = 5;
+  options.num_levels = kNumLevels;
+  options.max_compaction_bytes = 1;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Compact against an empty DB
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initially the base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 140; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(80)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 0) {
+    // Make sure level 0 is not empty
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(80)));
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::set<int> output_levels;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::CompactRange:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        output_levels.insert(compaction->output_level());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(output_levels.size(), 2);
+  ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+  ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  // The base level is still level 3.
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.max_compaction_bytes = 100000000;
+
+  DestroyAndReopen(options);
+
+  int non_trivial = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  const int total_keys = 3000;
+  const int random_part_size = 100;
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = rnd.RandomString(random_part_size);
+    PutFixed32(&value, static_cast<uint32_t>(i));
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ(non_trivial, 0);
+
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = Get(Key(i));
+    ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+              static_cast<uint32_t>(i));
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+  Random rnd(301);
+  const int kMaxKey = 2000;
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.num_levels = 8;
+
+  DestroyAndReopen(options);
+
+  auto verify_func = [&](int num_keys, bool if_sleep) {
+    for (int i = 0; i < num_keys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+      if (i < num_keys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      }
+      if (if_sleep && i % 1000 == 0) {
+        // Without it, valgrind may choose not to give another
+        // thread a chance to run before finishing the function,
+        // causing the test to be extremely slow.
+        env_->SleepForMicroseconds(1);
+      }
+    }
+  };
+
+  int total_keys = 1000;
+  for (int i = 0; i < total_keys; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+  verify_func(total_keys, false);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  options.level_compaction_dynamic_level_bytes = true;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  verify_func(total_keys, false);
+
+  std::atomic_bool compaction_finished;
+  compaction_finished = false;
+  // Issue a manual compaction in one thread and still verify DB state
+  // in the main thread.
+  ROCKSDB_NAMESPACE::port::Thread t([&]() {
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = options.num_levels - 1;
+    ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+    compaction_finished.store(true);
+  });
+  do {
+    verify_func(total_keys, true);
+  } while (!compaction_finished.load());
+  t.join();
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  int total_keys2 = 2000;
+  for (int i = total_keys; i < total_keys2; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+
+  verify_func(total_keys2, false);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  verify_func(total_keys2, false);
+
+  // The base level is not level 1
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..73e89d158
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+  DBEncryptionTest()
+      : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
+  Env* GetTargetEnv() {
+    if (encrypted_env_ != nullptr) {
+      return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+    } else {
+      return env_;
+    }
+  }
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+  ASSERT_OK(Put("foo567", "v1.fetdq"));
+  ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+  Close();
+
+  // Open all files and look for the values we've put in there.
+  // They should not be found if encrypted; otherwise
+  // they should be found.
+  std::vector<std::string> fileNames;
+  auto status = env_->GetChildren(dbname_, &fileNames);
+  ASSERT_OK(status);
+
+  Env* target = GetTargetEnv();
+  int hits = 0;
+  for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
+    if (*it == "LOCK") {
+      continue;
+    }
+    auto filePath = dbname_ + "/" + *it;
+    std::unique_ptr<SequentialFile> seqFile;
+    auto envOptions = EnvOptions(CurrentOptions());
+    status = target->NewSequentialFile(filePath, &seqFile, envOptions);
+    ASSERT_OK(status);
+
+    uint64_t fileSize;
+    status = target->GetFileSize(filePath, &fileSize);
+    ASSERT_OK(status);
+
+    std::string scratch;
+    scratch.reserve(fileSize);
+    Slice data;
+    status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+    ASSERT_OK(status);
+
+    if (data.ToString().find("foo567") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("v1.fetdq") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("bar123") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("dfgk") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+  }
+  if (encrypted_env_) {
+    ASSERT_EQ(hits, 0);
+  } else {
+    ASSERT_GE(hits, 4);
+  }
+}
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+  auto defaultEnv = GetTargetEnv();
+
+  // Create an empty file for reading it back in later
+  auto envOptions = EnvOptions(CurrentOptions());
+  auto filePath = dbname_ + "/empty.empty";
+
+  Status status;
+  {
+    std::unique_ptr<WritableFile> writableFile;
+    status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+    ASSERT_OK(status);
+  }
+
+  std::unique_ptr<SequentialFile> seqFile;
+  status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+  ASSERT_OK(status);
+
+  std::string scratch;
+  Slice data;
+  // Reading back 16 bytes from the empty file shouldn't trigger an assertion.
+  // It should just work and return an empty string.
+  status = seqFile->Read(16, &data, (char*)scratch.data());
+  ASSERT_OK(status);
+
+  ASSERT_TRUE(data.empty());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..aa9bd738a
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/types.h"
+#include "test_util/sync_point.h"
+#include "util/file_checksum_helper.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::FlushForGetLiveFiles() {
+  mutex_.AssertHeld();
+
+  // flush all dirty data to disk.
+  Status status;
+  if (immutable_db_options_.atomic_flush) {
+    autovector<ColumnFamilyData*> cfds;
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+    mutex_.Unlock();
+    status =
+        AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
+    if (status.IsColumnFamilyDropped()) {
+      status = Status::OK();
+    }
+    mutex_.Lock();
+  } else {
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+      mutex_.Lock();
+      if (!status.ok() && !status.IsColumnFamilyDropped()) {
+        break;
+      } else if (status.IsColumnFamilyDropped()) {
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    Status status = FlushForGetLiveFiles();
+    if (!status.ok()) {
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+                      status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live table and blob files
+  std::vector<uint64_t> live_table_files;
+  std::vector<uint64_t> live_blob_files;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
+  }
+
+  ret.clear();
+  ret.reserve(live_table_files.size() + live_blob_files.size() +
+              3);  // for CURRENT + MANIFEST + OPTIONS
+
+  // create names of the live files. The names are not absolute
+  // paths, instead they are relative to dbname_.
+  for (const auto& table_file_number : live_table_files) {
+    ret.emplace_back(MakeTableFileName("", table_file_number));
+  }
+
+  for (const auto& blob_file_number : live_blob_files) {
+    ret.emplace_back(BlobFileName("", blob_file_number));
+  }
+
+  ret.emplace_back(CurrentFileName(""));
+  ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
+  // The OPTIONS file number is zero in read-write mode when OPTIONS file
+  // writing failed and the DB was configured with
+  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+  // number is zero when no OPTIONS file exists at all. In those cases we do
+  // not record any OPTIONS file in the live file list.
+  if (versions_->options_file_number() != 0) {
+    ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
+  }
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->manifest_file_size();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // Record tracked WALs as a (minimum) cross-check for directory scan
+  std::vector<uint64_t> required_by_manifest;
+
+  // If the caller disabled deletions, this function should return files that
+  // are guaranteed not to be deleted until deletions are re-enabled. We need
+  // to wait for pending purges to finish since WalManager doesn't know which
+  // files are going to be purged. Additional purges won't be scheduled as
+  // long as deletions are disabled (so the below loop must terminate).
+  // Also note that we disable deletions anyway to avoid the case where a
+  // file is deleted in the middle of the scan, causing an IO error.
+  Status deletions_disabled = DisableFileDeletions();
+  {
+    InstrumentedMutexLock l(&mutex_);
+    while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+      bg_cv_.Wait();
+    }
+
+    // Record tracked WALs as a (minimum) cross-check for directory scan
+    const auto& manifest_wals = versions_->GetWalSet().GetWals();
+    required_by_manifest.reserve(manifest_wals.size());
+    for (const auto& wal : manifest_wals) {
+      required_by_manifest.push_back(wal.first);
+    }
+  }
+
+  Status s = wal_manager_.GetSortedWalFiles(files);
+
+  // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
+  if (deletions_disabled.ok()) {
+    Status s2 = EnableFileDeletions(/*force*/ false);
+    assert(s2.ok());
+    s2.PermitUncheckedError();
+  } else {
+    assert(deletions_disabled.IsNotSupported());
+  }
+
+  if (s.ok()) {
+    // Verify the scan includes those required by the manifest (one sorted
+    // list is a superset of the other).
+    auto required = required_by_manifest.begin();
+    auto included = files.begin();
+
+    while (required != required_by_manifest.end()) {
+      if (included == files.end() || *required < (*included)->LogNumber()) {
+        // FAIL - did not find
+        return Status::Corruption(
+            "WAL file " + std::to_string(*required) +
+            " required by manifest but not in directory list");
+      }
+      if (*required == (*included)->LogNumber()) {
+        ++required;
+        ++included;
+      } else {
+        assert(*required > (*included)->LogNumber());
+        ++included;
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+  uint64_t current_logfile_number;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    current_logfile_number = logfile_number_;
+  }
+
+  return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+
+Status DBImpl::GetLiveFilesStorageInfo(
+    const LiveFilesStorageInfoOptions& opts,
+    std::vector<LiveFileStorageInfo>* files) {
+  // To avoid returning partial results, only move results to files on success.
+  assert(files);
+  files->clear();
+  std::vector<LiveFileStorageInfo> results;
+
+  // NOTE: This implementation was largely migrated from Checkpoint.
+
+  Status s;
+  VectorLogPtr live_wal_files;
+  bool flush_memtable = true;
+  if (!immutable_db_options_.allow_2pc) {
+    if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
+      flush_memtable = false;
+    } else if (opts.wal_size_for_flush > 0) {
+      // If the outstanding log files are small, we skip the flush.
+      s = GetSortedWalFiles(live_wal_files);
+
+      if (!s.ok()) {
+        return s;
+      }
+
+      // Don't flush column families if total log size is smaller than
+      // log_size_for_flush. We copy the log files instead.
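+      // (Illustration, with hypothetical numbers: if opts.wal_size_for_flush
+      //  is 16MB and there are two live WALs of 4MB and 6MB, then
+      //  total_wal_size = 10MB < 16MB, so flush_memtable is set to false
+      //  below and the WAL files are copied as-is.)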
+ // We may be able to cover 2PC case too. + uint64_t total_wal_size = 0; + for (auto& wal : live_wal_files) { + total_wal_size += wal->SizeFileBytes(); + } + if (total_wal_size < opts.wal_size_for_flush) { + flush_memtable = false; + } + live_wal_files.clear(); + } + } + + // This is a modified version of GetLiveFiles, to get access to more + // metadata. + mutex_.Lock(); + if (flush_memtable) { + Status status = FlushForGetLiveFiles(); + if (!status.ok()) { + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + // Make a set of all of the live table and blob files + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + VersionStorageInfo& vsi = *cfd->current()->storage_info(); + auto& cf_paths = cfd->ioptions()->cf_paths; + + auto GetDir = [&](size_t path_id) { + // Matching TableFileName() behavior + if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + + for (int level = 0; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = MakeTableFileName(meta->fd.GetNumber()); + info.directory = GetDir(meta->fd.GetPathId()); + info.file_number = meta->fd.GetNumber(); + info.file_type = kTableFile; + info.size = meta->fd.GetFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->file_checksum_func_name; + info.file_checksum = meta->file_checksum; + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + info.temperature = meta->temperature; + } + } + const auto& blob_files = vsi.GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = BlobFileName(meta->GetBlobFileNumber()); + info.directory = GetDir(/* path_id */ 0); + info.file_number = meta->GetBlobFileNumber(); + info.file_type = kBlobFile; + info.size = meta->GetBlobFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->GetChecksumMethod(); + info.file_checksum = meta->GetChecksumValue(); + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + // TODO?: info.temperature + } + } + + // Capture some final info before releasing mutex + const uint64_t manifest_number = versions_->manifest_file_number(); + const uint64_t manifest_size = versions_->manifest_file_size(); + const uint64_t options_number = versions_->options_file_number(); + const uint64_t options_size = versions_->options_file_size_; + const uint64_t min_log_num = MinLogNumberToKeep(); + + mutex_.Unlock(); + + std::string manifest_fname = DescriptorFileName(manifest_number); + { // MANIFEST + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = manifest_fname; + info.directory = GetName(); + info.file_number = manifest_number; + info.file_type = kDescriptorFile; + info.size = manifest_size; + info.trim_to_size = true; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = 
kUnknownFileChecksum;
+    }
+  }
+
+  {  // CURRENT
+    results.emplace_back();
+    LiveFileStorageInfo& info = results.back();
+
+    info.relative_filename = kCurrentFileName;
+    info.directory = GetName();
+    info.file_type = kCurrentFile;
+    // CURRENT could be replaced so we have to record the contents as needed.
+    info.replacement_contents = manifest_fname + "\n";
+    info.size = manifest_fname.size() + 1;
+    if (opts.include_checksum_info) {
+      info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      info.file_checksum = kUnknownFileChecksum;
+    }
+  }
+
+  // The OPTIONS file number is zero in read-write mode when OPTIONS file
+  // writing failed and the DB was configured with
+  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+  // number is zero when no OPTIONS file exists at all. In those cases we do
+  // not record any OPTIONS file in the live file list.
+  if (options_number != 0) {
+    results.emplace_back();
+    LiveFileStorageInfo& info = results.back();
+
+    info.relative_filename = OptionsFileName(options_number);
+    info.directory = GetName();
+    info.file_number = options_number;
+    info.file_type = kOptionsFile;
+    info.size = options_size;
+    if (opts.include_checksum_info) {
+      info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      info.file_checksum = kUnknownFileChecksum;
+    }
+  }
+
+  // Some legacy testing stuff TODO: carefully clean up obsolete parts
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
+
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
+
+  if (s.ok()) {
+    // To maximize the effectiveness of track_and_verify_wals_in_manifest,
+    // sync the WAL when it is enabled.
+    s = FlushWAL(
+        immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
+    if (s.IsNotSupported()) {  // read-only DB or similar
+      s = Status::OK();
+    }
+  }
+
+  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
+  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
+
+  // If we have more than one column family, we also need to get WAL files.
+  if (s.ok()) {
+    s = GetSortedWalFiles(live_wal_files);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  size_t wal_size = live_wal_files.size();
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Number of log files %" ROCKSDB_PRIszt,
+                 live_wal_files.size());
+
+  // Link WAL files. Copy the exact size of the last one because it is the
+  // only one that has changes after the last flush.
+  auto wal_dir = immutable_db_options_.GetWalDir();
+  for (size_t i = 0; s.ok() && i < wal_size; ++i) {
+    if ((live_wal_files[i]->Type() == kAliveLogFile) &&
+        (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
+      results.emplace_back();
+      LiveFileStorageInfo& info = results.back();
+      auto f = live_wal_files[i]->PathName();
+      assert(!f.empty() && f[0] == '/');
+      info.relative_filename = f.substr(1);
+      info.directory = wal_dir;
+      info.file_number = live_wal_files[i]->LogNumber();
+      info.file_type = kWalFile;
+      info.size = live_wal_files[i]->SizeFileBytes();
+      // Only the last one should need to be trimmed
+      info.trim_to_size = (i + 1 == wal_size);
+      if (opts.include_checksum_info) {
+        info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+        info.file_checksum = kUnknownFileChecksum;
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // Only move results to output on success.
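+    // (Caller-side sketch, for illustration only, not part of this file:
+    //    std::vector<LiveFileStorageInfo> infos;
+    //    Status st = db->GetLiveFilesStorageInfo(
+    //        LiveFilesStorageInfoOptions(), &infos);
+    //  on success, each entry describes one file a checkpoint or backup
+    //  needs to copy, including its directory, size, and trim_to_size.)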
+    *files = std::move(results);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..3b3f7e183
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,3084 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <limits>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is the static value written by a compaction filter below when
+// updating kvs during the compaction/flush process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBFlushTest : public DBTestBase {
+ public:
+  DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+                            public ::testing::WithParamInterface<bool> {
+ public:
+  DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+                          public ::testing::WithParamInterface<bool> {
+ public:
+  DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had an issue where, when two background threads were trying to flush at
+// the same time, only one of them got committed. The test verifies the issue
+// is fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.max_background_flushes = 2;
+  options.env = env_;
+  Reopen(options);
+  FlushOptions no_wait;
+  no_wait.wait = false;
+  no_wait.allow_write_stall = true;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply:WriteManifest",
+        "DBFlushTest::FlushWhileWritingManifest:1"},
+       {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+        "VersionSet::LogAndApply:WriteManifestDone"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("foo", "v"));
+  ASSERT_OK(dbfull()->Flush(no_wait));
+  TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+  ASSERT_OK(Put("bar", "v"));
+  ASSERT_OK(dbfull()->Flush(no_wait));
+  // If the issue is hit we will wait here forever.
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(2, TotalTableFiles());
+#endif  // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options;
+  options.disable_auto_compactions = true;
+  options.env = fault_injection_env.get();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+       {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put("key", "value"));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+  // Flush installs a new super-version. Get the ref count after that.
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+  TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+  fault_injection_env->SetFilesystemActive(true);
+  // Now the background job will do the flush; wait for it.
+  // It returns the IO error that happened during the flush.
+  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("", FilesPerLevel());  // flush failed.
+#endif  // ROCKSDB_LITE
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+  Options options = CurrentOptions();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+       {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("key", "value"));
+
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+
+  TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+  TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+  // Now the background job will do the flush; wait for it.
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+  // Verify that setting an empty high-pri (flush) thread pool causes flushes
+  // to be scheduled in the low-pri (compaction) thread pool.
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  Reopen(options);
+  env_->SetBackgroundThreads(0, Env::HIGH);
+
+  std::thread::id tid;
+  int num_flushes = 0, num_compactions = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+        if (tid == std::thread::id()) {
+          tid = std::this_thread::get_id();
+        } else {
+          ASSERT_EQ(tid, std::this_thread::get_id());
+        }
+        ++num_flushes;
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+        ASSERT_EQ(tid, std::this_thread::get_id());
+        ++num_compactions;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key", "val"));
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put("key", "val"));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(4, num_flushes);
+  ASSERT_EQ(1, num_compactions);
+}
+
+// Test that when a flush job is submitted to the low priority thread pool and
+// the DB is closed in the meanwhile, CloseHelper doesn't hang.
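+// (Illustration of the blocking mechanism the test relies on: a
+//  test::SleepingBackgroundTask is scheduled to occupy the single LOW-pri
+//  thread,
+//    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &task,
+//                   Env::Priority::LOW);
+//  so a flush job scheduled afterwards stays queued until Close()
+//  unschedules it.)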
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 1;
+  options.max_total_wal_size = 8192;
+
+  DestroyAndReopen(options);
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+
+  env_->SetBackgroundThreads(0, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  int num_flushes = 0;
+
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush",
+                                        [&](void* /*arg*/) { ++num_flushes; });
+
+  int num_low_flush_unscheduled = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) {
+        num_low_flush_unscheduled++;
+        // There should be one flush job in the low pool that needs to be
+        // unscheduled
+        ASSERT_EQ(num_low_flush_unscheduled, 1);
+      });
+
+  int num_high_flush_unscheduled = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) {
+        num_high_flush_unscheduled++;
+        // There should be no flush job in the high pool
+        ASSERT_EQ(num_high_flush_unscheduled, 0);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  // Block the thread so that the flush cannot run and can be removed from the
+  // queue when Unschedule is called.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Trigger a flush; the flush job will be scheduled in the LOW priority
+  // thread pool.
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+
+  // Close the DB; the flush job in the low priority queue will be removed
+  // without running.
+  Close();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_EQ(0, num_flushes);
+
+  TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key3", DummyString(8192)));
+  ASSERT_OK(Flush(0));
+  ASSERT_EQ(1, num_flushes);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkFlush",
+        "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+       {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+        "FlushJob::WriteLevel0Table"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key1", "value1"));
+
+  port::Thread t([&]() {
+    // The call waits for the flush to finish, i.e. with
+    // flush_options.wait = true.
+    ASSERT_OK(Flush());
+  });
+
+  // Wait for the flush to start.
+  TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finishes.
+  // At the end of the manual flush job, it will check if a further flush
+  // is needed, but it will not trigger a flush of the second memtable because
+  // min_write_buffer_number_to_merge is not reached.
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+  // The manual flush should return without waiting for the flush indefinitely.
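+  // (t.join() returning below is the actual check: if the manual flush
+  //  waited on the second memtable, the joined thread would block forever.)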
+  t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int called = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+        ASSERT_EQ(0, unscheduled_flushes);
+        ++called;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("a", "foo"));
+  FlushOptions flush_opts;
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_EQ(1, called);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// The following 3 tests are designed for testing garbage statistics at flush
+// time.
+//
+// ======= General Information ======= (from GitHub Wiki).
+// There are three scenarios where a memtable flush can be triggered:
+//
+// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size
+//     after a write.
+// 2 - Total memtable size across all column families exceeds
+//     DBOptions::db_write_buffer_size,
+//     or DBOptions::write_buffer_manager signals a flush. In this scenario
+//     the largest memtable will be flushed.
+// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size.
+//     In this scenario the memtable with the oldest data will be flushed,
+//     in order to allow the WAL file with data from this memtable to be
+//     purged.
+//
+// As a result, a memtable can be flushed before it is full. This is one
+// reason the generated SST file can be smaller than the corresponding
+// memtable. Compression is another factor that can make an SST file smaller
+// than the corresponding memtable, since data in a memtable is uncompressed.
+
+TEST_F(DBFlushTest, StatisticsGarbageBasic) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default.)
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 64 << 20;
+
+  ASSERT_OK(TryReopen(options));
+
+  // Put the same key-values multiple times.
+  // The encoded length of a db entry in the memtable is
+  // defined in db/memtable.cc (MemTable::Add) as the variable:
+  // encoded_len = VarintLength(internal_key_size)  --> min # of bytes
+  //                                                    necessary to store
+  //                                                    internal_key_size
+  //             + internal_key_size                --> the actual key string
+  //               (size key_size: w/o terminating null char)
+  //             + 8 bytes for the fixed uint64 "seq number + insertion type"
+  //             + VarintLength(val_size)           --> min # of bytes to
+  //                                                    store val_size
+  //             + val_size                         --> the actual value
+  //                                                    string
+  // For example, in our situation, "key1" : size 4, "value1" : size 6
+  // (the terminating null characters are not copied over to the memtable).
+  // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry.
+  // However, in terms of raw data contained in the memtable, and written
+  // over to the SSTable, we only count internal_key_size and val_size,
+  // because this is the only raw chunk of bytes that contains everything
+  // necessary to reconstruct a user entry: sequence number, insertion type,
+  // key, and value.
+
+  // To test the relevance of our Memtable garbage statistics,
+  // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+  // we insert K-V pairs with 3 distinct keys (of length 4),
+  // and random values of arbitrary length RAND_VALUES_LENGTH,
+  // and we repeat this step NUM_REPEAT times total.
+  // At the end, we insert 3 final K-V pairs with the same 3 keys
+  // and known values (these will be the final values, of length 6).
+  // I chose NUM_REPEAT=2,000 such that no automatic flush is
+  // triggered (the number of bytes in the memtable is therefore
+  // well below any meaningful heuristic for a memtable of size 64MB).
+  // As a result, since each K-V pair is inserted as a payload
+  // of N meaningful bytes (sequence number, insertion type,
+  // key, and value = 8 + 4 + RAND_VALUES_LENGTH),
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 3 * 2,000 * N bytes
+  // and MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH +
+  // (3*(8 + 4 + 6)) bytes. For RAND_VALUES_LENGTH = 172 (arbitrary value), we
+  // expect:
+  //  N = 8 + 4 + 172 = 184 bytes
+  //  MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 3 * 2,000 * 184 = 1,104,000 bytes.
+  //  MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 1,104,000 + 3*18 = 1,104,054 bytes.
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 172;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string VALUE1 = "value1";
+  const std::string VALUE2 = "value2";
+  const std::string VALUE3 = "value3";
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+  }
+
+  // The memtable data bytes include the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  ASSERT_OK(Put(KEY1, VALUE1));
+  ASSERT_OK(Put(KEY2, VALUE2));
+  ASSERT_OK(Put(KEY3, VALUE3));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() +
+      VALUE3.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the last K-V pairs have been successfully inserted,
+  // and that the valid values are VALUE1, VALUE2, VALUE3.
+  PinnableSlice value;
+  ASSERT_OK(Get(KEY1, &value));
+  ASSERT_EQ(value.ToString(), VALUE1);
+  ASSERT_OK(Get(KEY2, &value));
+  ASSERT_EQ(value.ToString(), VALUE2);
+  ASSERT_OK(Get(KEY3, &value));
+  ASSERT_EQ(value.ToString(), VALUE3);
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 37;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  WriteBatch batch;
+
+  Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(Delete(KEY1));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(Delete(KEY3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+  }
+
+  // The memtable data bytes include the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of deletes for KEY1, KEY2, KEY3 is written to the
+  // SSTable to propagate the delete operations to K-V pairs
+  // that could have been inserted into the database during past Flush
+  // operations.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful payload.
+  ASSERT_OK(Delete(KEY4));
+  ASSERT_OK(Delete(KEY5));
+  ASSERT_OK(Delete(KEY6));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  ASSERT_NOK(Get(KEY3, &value));
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 1000;
+  const size_t RAND_VALUES_LENGTH = 42;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+  const std::string VALUE3 = "value3";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+  // Also insert DeleteRanges.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               KEY1, KEY2));
+    // Note: DeleteRange has an exclusive upper bound, e.g. here: [KEY2,KEY3)
+    // is deleted.
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               KEY2, KEY3));
+    // Delete ranges are stored as regular K-V pairs, with key=STARTKEY,
+    // value=ENDKEY.
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+        (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+  }
+
+  // The memtable data bytes include the "garbage"
+  // bytes along with the useful payload.
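+  // (Worked example with the constants above: each iteration adds
+  //  3 * (4 + 42 + 8) = 162 bytes of puts plus 2 * (4 + 4 + 8) = 32 bytes of
+  //  range tombstones, i.e. 194 bytes, so over 1,000 iterations the raw
+  //  memtable data amounts to 194,000 bytes before the adjustments below.)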
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of DeleteRanges for (KEY1, KEY2) and (KEY2, KEY3) is written
+  // to the SSTable to propagate the DeleteRange operations to K-V pairs that
+  // could have been inserted into the database during past Flush operations.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+      (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+
+  // Overwrite KEY3 with a known value (VALUE3).
+  // Note that during the whole time KEY3 has never been deleted
+  // by the RangeDeletes.
+  ASSERT_OK(Put(KEY3, VALUE3));
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY3.size() + VALUE3.size() + sizeof(uint64_t);
+
+  // Additional useful payload.
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      (KEY4.size() + KEY5.size() + sizeof(uint64_t)) +
+      (KEY5.size() + KEY6.size() + sizeof(uint64_t));
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  // And that KEY3's value is correct.
+  ASSERT_OK(Get(KEY3, &value));
+  ASSERT_EQ(value, VALUE3);
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+  TestFlushListener(Env* env, DBFlushTest* test)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+    db_closed = false;
+  }
+
+  ~TestFlushListener() override {
+    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (info.triggered_writes_stop) {
+      stop_count++;
+    }
+    // verify whether the previously created file matches the flushed file.
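+    // (This cross-check works because, for a given flush, the
+    //  OnTableFileCreated notification fires before OnFlushCompleted, and
+    //  this listener, per the comment above, handles one flush at a time.)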
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+    // Note: the following chunk relies on the notification pertaining to the
+    // database pointed to by DBTestBase::db_, and is thus bypassed when
+    // that assumption does not hold (see the test case MultiDBMultiListeners
+    // below).
+    ASSERT_TRUE(test_);
+    if (db == test_->db_) {
+      std::vector<std::vector<FileMetaData>> files_by_level;
+      test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
+                                             &files_by_level);
+
+      ASSERT_FALSE(files_by_level.empty());
+      auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+                             [&](const FileMetaData& meta) {
+                               return meta.fd.GetNumber() == info.file_number;
+                             });
+      ASSERT_NE(it, files_by_level[0].end());
+      ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
+  DBFlushTest* test_;
+};
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, MemPurgeBasic) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default.)
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (2^20 = 1048576 bytes).
+  options.write_buffer_size = 1 << 20;
+#ifndef ROCKSDB_LITE
+  // Initially deactivate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 0.0;
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#else
+  // Activate directly the MemPurge prototype.
+  // (RocksDB lite does not support dynamic options)
+  options.experimental_mempurge_threshold = 1.0;
+#endif  // !ROCKSDB_LITE
+  ASSERT_OK(TryReopen(options));
+
+  // RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+  // Dynamically activate the MemPurge prototype without restarting the DB.
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "1.0"}}));
+#endif
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::string KEY1 = "IamKey1";
+  std::string KEY2 = "IamKey2";
+  std::string KEY3 = "IamKey3";
+  std::string KEY4 = "IamKey4";
+  std::string KEY5 = "IamKey5";
+  std::string KEY6 = "IamKey6";
+  std::string KEY7 = "IamKey7";
+  std::string KEY8 = "IamKey8";
+  std::string KEY9 = "IamKey9";
+  std::string RNDKEY1, RNDKEY2, RNDKEY3;
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(719);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_KEYS_LENGTH = 57;
+  const size_t RAND_VALUES_LENGTH = 10240;
+  std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1,
+      p_rv2, p_rv3;
+
+  // Insert a very first set of keys that will be
+  // mempurged at least once.
+  p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+  ASSERT_OK(Put(KEY1, p_v1));
+  ASSERT_OK(Put(KEY2, p_v2));
+  ASSERT_OK(Put(KEY3, p_v3));
+  ASSERT_OK(Put(KEY4, p_v4));
+  ASSERT_EQ(Get(KEY1), p_v1);
+  ASSERT_EQ(Get(KEY2), p_v2);
+  ASSERT_EQ(Get(KEY3), p_v3);
+  ASSERT_EQ(Get(KEY4), p_v4);
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+    ASSERT_OK(Put(KEY5, p_v5));
+    ASSERT_OK(Put(KEY6, p_v6));
+    ASSERT_OK(Put(KEY7, p_v7));
+    ASSERT_OK(Put(KEY8, p_v8));
+    ASSERT_OK(Put(KEY9, p_v9));
+
+    ASSERT_EQ(Get(KEY1), p_v1);
+    ASSERT_EQ(Get(KEY2), p_v2);
+    ASSERT_EQ(Get(KEY3), p_v3);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+    ASSERT_EQ(Get(KEY6), p_v6);
+    ASSERT_EQ(Get(KEY7), p_v7);
+    ASSERT_EQ(Get(KEY8), p_v8);
+    ASSERT_EQ(Get(KEY9), p_v9);
+  }
+
+  // Check that there was at least one mempurge.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Insertion of K-V pairs, no overwrites.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH);
+    RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH);
+    RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH);
+    p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+    ASSERT_OK(Put(RNDKEY1, p_rv1));
+    ASSERT_OK(Put(RNDKEY2, p_rv2));
+    ASSERT_OK(Put(RNDKEY3, p_rv3));
+
+    ASSERT_EQ(Get(KEY1), p_v1);
+    ASSERT_EQ(Get(KEY2), p_v2);
+    ASSERT_EQ(Get(KEY3), p_v3);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+    ASSERT_EQ(Get(KEY6), p_v6);
+    ASSERT_EQ(Get(KEY7), p_v7);
+    ASSERT_EQ(Get(KEY8), p_v8);
+    ASSERT_EQ(Get(KEY9), p_v9);
+    ASSERT_EQ(Get(RNDKEY1), p_rv1);
+    ASSERT_EQ(Get(RNDKEY2), p_rv2);
+    ASSERT_EQ(Get(RNDKEY3), p_rv3);
+  }
+
+  // Assert that at least one flush to storage has been performed
+  EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+  // (which will consequently increase the number of mempurges recorded too).
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+
+  // Assert that there is no data corruption, even with
+  // a flush to storage.
+  ASSERT_EQ(Get(KEY1), p_v1);
+  ASSERT_EQ(Get(KEY2), p_v2);
+  ASSERT_EQ(Get(KEY3), p_v3);
+  ASSERT_EQ(Get(KEY4), p_v4);
+  ASSERT_EQ(Get(KEY5), p_v5);
+  ASSERT_EQ(Get(KEY6), p_v6);
+  ASSERT_EQ(Get(KEY7), p_v7);
+  ASSERT_EQ(Get(KEY8), p_v8);
+  ASSERT_EQ(Get(KEY9), p_v9);
+  ASSERT_EQ(Get(RNDKEY1), p_rv1);
+  ASSERT_EQ(Get(RNDKEY2), p_rv2);
+  ASSERT_EQ(Get(RNDKEY3), p_rv3);
+
+  Close();
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, MemPurgeBasicToggle) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default.)
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (2^20 = 1048576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Initially deactivate the MemPurge prototype.
+  // (negative values are equivalent to 0.0).
+  options.experimental_mempurge_threshold = -25.3;
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+
+  ASSERT_OK(TryReopen(options));
+  // Dynamically activate the MemPurge prototype without restarting the DB.
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  // Values greater than 1.0 are equivalent to 1.0
+  ASSERT_OK(
+      db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "3.7898"}}));
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const size_t KVSIZE = 3;
+  std::vector<std::string> KEYS(KVSIZE);
+  for (size_t k = 0; k < KVSIZE; k++) {
+    KEYS[k] = "IamKey" + std::to_string(k);
+  }
+
+  std::vector<std::string> RNDVALS(KVSIZE);
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(719);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+  }
+
+  // Check that there was at least one mempurge.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Dynamically deactivate MemPurge.
+  ASSERT_OK(
+      db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "-1023.0"}}));
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+  }
+
+  // After the deactivation, there should be no mempurge at all.
+  const uint32_t ZERO = 0;
+  // Assert that at least one flush to storage has been performed
+  EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+  // The mempurge count is expected to be set to 0 when the options are
+  // updated; we expect no mempurge at all.
+  EXPECT_EQ(mempurge_count.exchange(0), ZERO);
+
+  Close();
+}
+// Closes the "#ifndef ROCKSDB_LITE"
+// End of MemPurgeBasicToggle, which is not
+// supported with RocksDB LITE because it
+// relies on dynamically changing the option
+// flag experimental_mempurge_threshold.
+#endif
+
+// At the moment, the MemPurge feature is deactivated
+// when atomic_flush is enabled. This is because the level
+// of garbage between Column Families is not guaranteed to
+// be consistent, therefore a CF could hypothetically
+// trigger a MemPurge while another CF would trigger
+// a regular Flush.
+TEST_F(DBFlushTest, MemPurgeWithAtomicFlush) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default.)
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (2^20 = 1048576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 153.245;
+  // Activate atomic_flush.
+  options.atomic_flush = true;
+
+  const std::vector<std::string> new_cf_names = {"pikachu", "eevie"};
+  CreateColumnFamilies(new_cf_names, options);
+
+  Close();
+
+  // 3 CFs: default will be filled with overwrites (would normally trigger
+  //        mempurge),
+  //        new_cf_names[0] will be filled with random values (would trigger
+  //        flush), and new_cf_names[1] is not filled with anything.
+  ReopenWithColumnFamilies(
+      {kDefaultColumnFamilyName, new_cf_names[0], new_cf_names[1]}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(2, "bar", "baz"));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  const size_t KVSIZE = 3;
+  std::vector<std::string> KEYS(KVSIZE);
+  for (size_t k = 0; k < KVSIZE; k++) {
+    KEYS[k] = "IamKey" + std::to_string(k);
+  }
+
+  std::string RNDKEY;
+  std::vector<std::string> RNDVALS(KVSIZE);
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(106);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_KEY_LENGTH = 128;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_OK(Put(1, RNDKEY, RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+      ASSERT_EQ(Get(1, RNDKEY), RNDVALS[j]);
+    }
+  }
+
+  // Check that there was no mempurge because the atomic_flush option is true.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 0;
+  // Check that there was at least one SST file created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 1;
+
+  EXPECT_EQ(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_GE(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
+  Options options = CurrentOptions();
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#endif  // !ROCKSDB_LITE
+  // Enforce size of a single MemTable to 1MB (2^20 = 1048576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 15.0;
+
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::string KEY1 = "ThisIsKey1";
+  std::string KEY2 = "ThisIsKey2";
+  std::string KEY3 = "ThisIsKey3";
+  std::string KEY4 = "ThisIsKey4";
+  std::string KEY5 = "ThisIsKey5";
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  Random rnd(117);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5;
+  int count = 0;
+  const int EXPECTED_COUNT_FORLOOP = 3;
+  const int EXPECTED_COUNT_END = 4;
+
+  ReadOptions ropt;
+  ropt.pin_data = true;
+  ropt.total_order_seek = true;
+  Iterator* iter = nullptr;
+
+  // Insertion of K-V pairs, multiple times.
+  // Also insert DeleteRanges.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v3b = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    ASSERT_OK(Put(KEY4, p_v4));
+    ASSERT_OK(Put(KEY5, p_v5));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               KEY2, KEY4));
+    ASSERT_OK(Put(KEY3, p_v3b));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               KEY1, KEY3));
+    ASSERT_OK(Delete(KEY1));
+
+    ASSERT_EQ(Get(KEY1), NOT_FOUND);
+    ASSERT_EQ(Get(KEY2), NOT_FOUND);
+    ASSERT_EQ(Get(KEY3), p_v3b);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+
+    iter = db_->NewIterator(ropt);
+    iter->SeekToFirst();
+    count = 0;
+    for (; iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      key = (iter->key()).ToString(false);
+      value = (iter->value()).ToString(false);
+      if (key.compare(KEY3) == 0)
+        ASSERT_EQ(value, p_v3b);
+      else if (key.compare(KEY4) == 0)
+        ASSERT_EQ(value, p_v4);
+      else if (key.compare(KEY5) == 0)
+        ASSERT_EQ(value, p_v5);
+      else
+        ASSERT_EQ(value, NOT_FOUND);
+      count++;
+    }
+
+    // Expected count here is 3: KEY3, KEY4, KEY5.
+    ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP);
+    if (iter) {
+      delete iter;
+    }
+  }
+
+  // Check that there was at least one mempurge.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that no SST file was created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Additional test for the iterator+memPurge.
+  ASSERT_OK(Put(KEY2, p_v2));
+  iter = db_->NewIterator(ropt);
+  iter->SeekToFirst();
+  ASSERT_OK(Put(KEY4, p_v4));
+  count = 0;
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    key = (iter->key()).ToString(false);
+    value = (iter->value()).ToString(false);
+    if (key.compare(KEY2) == 0)
+      ASSERT_EQ(value, p_v2);
+    else if (key.compare(KEY3) == 0)
+      ASSERT_EQ(value, p_v3b);
+    else if (key.compare(KEY4) == 0)
+      ASSERT_EQ(value, p_v4);
+    else if (key.compare(KEY5) == 0)
+      ASSERT_EQ(value, p_v5);
+    else
+      ASSERT_EQ(value, NOT_FOUND);
+    count++;
+  }
+
+  // Expected count here is 4: KEY2, KEY3, KEY4, KEY5.
+  ASSERT_EQ(count, EXPECTED_COUNT_END);
+  if (iter) delete iter;
+
+  Close();
+}
+
+// Create a compaction filter that will be invoked
+// at flush time and will update the value of a KV pair
+// if the key string is "lower" than the filtered_key_ string.
+class ConditionalUpdateFilter : public CompactionFilter {
+ public:
+  explicit ConditionalUpdateFilter(const std::string* filtered_key)
+      : filtered_key_(filtered_key) {}
+  bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+              std::string* new_value, bool* value_changed) const override {
+    // If key < filtered_key_, update the value of the KV pair.
+    if (key.compare(*filtered_key_) < 0) {
+      assert(new_value != nullptr);
+      *new_value = NEW_VALUE;
+      *value_changed = true;
+    }
+    return false /*do not remove this KV pair*/;
+  }
+
+  const char* Name() const override { return "ConditionalUpdateFilter"; }
+
+ private:
+  const std::string* filtered_key_;
+};
+
+class ConditionalUpdateFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalUpdateFilterFactory(const Slice& filtered_key)
+      : filtered_key_(filtered_key.ToString()) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalUpdateFilter(&filtered_key_));
+  }
+
+  const char* Name() const override {
+    return "ConditionalUpdateFilterFactory";
+  }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    // This compaction filter will be invoked
+    // at flush time (and therefore at MemPurge time).
+    return (reason == TableFileCreationReason::kFlush);
+  }
+
+ private:
+  std::string filtered_key_;
+};
+
+TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
+  Options options = CurrentOptions();
+
+  std::string KEY1 = "ThisIsKey1";
+  std::string KEY2 = "ThisIsKey2";
+  std::string KEY3 = "ThisIsKey3";
+  std::string KEY4 = "ThisIsKey4";
+  std::string KEY5 = "ThisIsKey5";
+  std::string KEY6 = "ThisIsKey6";
+  std::string KEY7 = "ThisIsKey7";
+  std::string KEY8 = "ThisIsKey8";
+  std::string KEY9 = "ThisIsKey9";
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#endif  // !ROCKSDB_LITE
+  // Create a ConditionalUpdate compaction filter
+  // that will update all the values of the KV pairs
+  // where the keys are "lower" than KEY4.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalUpdateFilterFactory>(KEY4);
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1,048,576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
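+  // (As noted in the WAL-support test below, threshold values above 1.0 are
+  // equivalent to 1.0, so 26.55 simply keeps the mempurge heuristic always
+  // active for this test.)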
+  options.experimental_mempurge_threshold = 26.55;
+
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(53);
+  const size_t NUM_REPEAT = 1000;
+  const size_t RAND_VALUES_LENGTH = 10240;
+  std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9;
+
+  p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+  ASSERT_OK(Put(KEY1, p_v1));
+  ASSERT_OK(Put(KEY2, p_v2));
+  ASSERT_OK(Put(KEY3, p_v3));
+  ASSERT_OK(Put(KEY4, p_v4));
+  ASSERT_OK(Put(KEY5, p_v5));
+  ASSERT_OK(Delete(KEY1));
+
+  // Insertion of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary
+    // length RAND_VALUES_LENGTH bytes.
+    p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY6, p_v6));
+    ASSERT_OK(Put(KEY7, p_v7));
+    ASSERT_OK(Put(KEY8, p_v8));
+    ASSERT_OK(Put(KEY9, p_v9));
+
+    ASSERT_OK(Delete(KEY7));
+  }
+
+  // Check that there was at least one mempurge.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that no SST file was created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Verify that the ConditionalUpdateFilter
+  // updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
+  ASSERT_EQ(Get(KEY1), NOT_FOUND);
+  ASSERT_EQ(Get(KEY2), NEW_VALUE);
+  ASSERT_EQ(Get(KEY3), NEW_VALUE);
+  ASSERT_EQ(Get(KEY4), p_v4);
+  ASSERT_EQ(Get(KEY5), p_v5);
+}
+
+TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) {
+  Options options = CurrentOptions();
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 128KB.
+  options.write_buffer_size = 128 << 10;
+  // Activate the MemPurge prototype
+  // (values > 1.0 are equivalent to 1.0).
+  options.experimental_mempurge_threshold = 2.5;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t KVSIZE = 10;
+
+  do {
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    ASSERT_OK(Put(0, "bar", "v2"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Put(1, "foo", "v3"));
+    std::atomic<uint32_t> mempurge_count{0};
+    std::atomic<uint32_t> sst_count{0};
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::FlushJob:MemPurgeSuccessful",
+        [&](void* /*arg*/) { mempurge_count++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::FlushJob:SSTFileCreated",
+        [&](void* /*arg*/) { sst_count++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<std::string> keys;
+    for (size_t k = 0; k < KVSIZE; k++) {
+      keys.push_back("IamKey" + std::to_string(k));
+    }
+
+    std::string RNDKEY, RNDVALUE;
+    const std::string NOT_FOUND = "NOT_FOUND";
+
+    // Heavy overwrite workload,
+    // more than would fit in maximum allowed memtables.
+    Random rnd(719);
+    const size_t NUM_REPEAT = 100;
+    const size_t RAND_KEY_LENGTH = 4096;
+    const size_t RAND_VALUES_LENGTH = 1024;
+    std::vector<std::string> values_default(KVSIZE), values_pikachu(KVSIZE);
+
+    // Insert an initial set of keys that will be
+    // mempurged at least once.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+    }
+
+    // Insert keys[0:KVSIZE/2] into
+    // both the 'default' and 'pikachu' CFs.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      ASSERT_OK(Put(0, keys[k], values_default[k]));
+      ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+    }
+
+    // Check that the insertion was seamless.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+
+    // Insertion of K-V pairs, multiple times (overwrites),
+    // into the 'default' CF. Will trigger mempurge.
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      }
+
+      // Insert K-V into default CF.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        ASSERT_OK(Put(0, keys[k], values_default[k]));
+      }
+
+      // Check key validity, for all keys, both in
+      // default and pikachu CFs.
+      for (size_t k = 0; k < KVSIZE; k++) {
+        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      }
+      // Note that at this point, only keys[0:KVSIZE/2]
+      // have been inserted into Pikachu.
+      for (size_t k = 0; k < KVSIZE / 2; k++) {
+        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+      }
+    }
+
+    // Insertion of K-V pairs, multiple times (overwrites),
+    // into the 'pikachu' CF. Will trigger mempurge.
+    // Check that we keep the older logs for the 'default' imm().
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      }
+
+      // Insert K-V into pikachu CF.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+      }
+
+      // Check key validity, for all keys,
+      // both in default and pikachu.
+      for (size_t k = 0; k < KVSIZE; k++) {
+        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+      }
+    }
+
+    // Check that there was at least one mempurge.
+    const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+    // Check that no SST file was created during flush.
+    const uint32_t EXPECTED_SST_COUNT = 0;
+
+    EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+    if (options.experimental_mempurge_threshold ==
+        std::numeric_limits<double>::max()) {
+      EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Check that there was no data corruption anywhere,
+    // neither in the 'default' nor in the 'pikachu' CF.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    // Check keys in 'default' and 'pikachu'.
+    // keys[0:KVSIZE/2] were for sure contained
+    // in the imm() at Reopen/recovery time.
+    for (size_t k = 0; k < KVSIZE; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+    // Insertion of random K-V pairs to trigger
+    // a flush in the pikachu CF.
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+      RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(1, RNDKEY, RNDVALUE));
+    }
+    // Assert that there was at least one flush to storage.
+    EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    // Since values in default are held in mutable mem()
+    // and imm(), check that the flush in pikachu didn't
+    // affect these values.
+    for (size_t k = 0; k < KVSIZE; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+    ASSERT_EQ(Get(1, RNDKEY), RNDVALUE);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBFlushTest, MemPurgeCorrectLogNumberAndSSTFileCreation) {
+  // Before our bug fix, we noticed that when 2 memtables were
+  // being flushed (with one memtable being the output of a
+  // previous MemPurge and one memtable being a newly-sealed memtable),
+  // the SST file created was not properly added to the DB version
+  // (via the VersionEdit obj), leading to data loss (the SST file
+  // was later being purged as an obsolete file).
+  // Therefore, we reproduce this scenario to test our fix.
+  Options options = CurrentOptions();
+
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1,048,576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 1.0;
+
+  // Force to have more than one memtable to trigger a flush.
+  // For some reason this option does not seem to be enforced,
+  // so the following test is designed to make sure that we
+  // are testing the correct test case.
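+  // (Allowing up to 5 memtables while merging at least 3 lets immutable
+  // memtables accumulate, so that the first real flush can pick up both a
+  // mempurge output memtable and a newly sealed one.)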
+  options.min_write_buffer_number_to_merge = 3;
+  options.max_write_buffer_number = 5;
+  options.max_write_buffer_size_to_maintain = 2 * (options.write_buffer_size);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dummy variable used for the following callback function.
+  uint64_t ZERO = 0;
+  // We will first execute mempurge operations exclusively.
+  // Therefore, when the first flush is triggered, we want to make
+  // sure there are at least 2 memtables being flushed: one output
+  // from a previous mempurge, and one newly sealed memtable.
+  // This is when we observed in the past that some SST files created
+  // were not properly added to the DB version (via the VersionEdit obj).
+  std::atomic<uint64_t> num_memtable_at_first_flush(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:num_memtables", [&](void* arg) {
+        uint64_t* mems_size = reinterpret_cast<uint64_t*>(arg);
+        // atomic_compare_exchange_strong sometimes updates the value
+        // of ZERO (the "expected" object), so we make sure ZERO is indeed...
+        // zero.
+        ZERO = 0;
+        std::atomic_compare_exchange_strong(&num_memtable_at_first_flush,
+                                            &ZERO, *mems_size);
+      });
+
+  const std::vector<std::string> KEYS = {
+      "ThisIsKey1", "ThisIsKey2", "ThisIsKey3", "ThisIsKey4", "ThisIsKey5",
+      "ThisIsKey6", "ThisIsKey7", "ThisIsKey8", "ThisIsKey9"};
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  Random rnd(117);
+  const uint64_t NUM_REPEAT_OVERWRITES = 100;
+  const uint64_t NUM_RAND_INSERTS = 500;
+  const uint64_t RAND_VALUES_LENGTH = 10240;
+
+  std::string key, value;
+  std::vector<std::string> values(9, "");
+
+  // Keys used to check that no SST file disappeared.
+  for (uint64_t k = 0; k < 5; k++) {
+    values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEYS[k], values[k]));
+  }
+
+  // Insertion of K-V pairs, multiple times.
+  // Trigger at least one mempurge and no SST file creation.
+  for (size_t i = 0; i < NUM_REPEAT_OVERWRITES; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    for (uint64_t k = 5; k < values.size(); k++) {
+      values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[k], values[k]));
+    }
+    // Check database consistency.
+    for (uint64_t k = 0; k < values.size(); k++) {
+      ASSERT_EQ(Get(KEYS[k]), values[k]);
+    }
+  }
+
+  // Check that there was at least one mempurge.
+  uint32_t expected_min_mempurge_count = 1;
+  // Check that no SST file was created during flush.
+  uint32_t expected_sst_count = 0;
+  EXPECT_GE(mempurge_count.load(), expected_min_mempurge_count);
+  EXPECT_EQ(sst_count.load(), expected_sst_count);
+
+  // Trigger an SST file creation and no mempurge.
+  for (size_t i = 0; i < NUM_RAND_INSERTS; i++) {
+    key = rnd.RandomString(RAND_VALUES_LENGTH);
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    value = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(key, value));
+    // Check database consistency.
+    for (uint64_t k = 0; k < values.size(); k++) {
+      ASSERT_EQ(Get(KEYS[k]), values[k]);
+    }
+    ASSERT_EQ(Get(key), value);
+  }
+
+  // Check that at least one SST file was created during flush.
+  expected_sst_count = 1;
+  EXPECT_GE(sst_count.load(), expected_sst_count);
+
+  // Oddly enough, num_memtable_at_first_flush is not enforced to be
+  // equal to min_write_buffer_number_to_merge. So we assert instead that
+  // the first SST file creation came from at least two memtables: one
+  // output memtable from a previous mempurge, and one newly sealed
+  // memtable. This is the scenario where we observed that some SST files
+  // created were not properly added to the DB version before our bug fix.
+  ASSERT_GE(num_memtable_at_first_flush.load(), 2);
+
+  // Check that no data was lost after SST file creation.
+  for (uint64_t k = 0; k < values.size(); k++) {
+    ASSERT_EQ(Get(KEYS[k]), values[k]);
+  }
+  // Extra check of database consistency.
+  ASSERT_EQ(Get(key), value);
+
+  Close();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.max_background_flushes = 2;
+  options.use_direct_io_for_flush_and_compaction = GetParam();
+  options.env = MockEnv::Create(Env::Default());
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:create_file", [&](void* arg) {
+        bool* use_direct_writes = static_cast<bool*>(arg);
+        ASSERT_EQ(*use_direct_writes,
+                  options.use_direct_io_for_flush_and_compaction);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_OK(Put("foo", "v"));
+  FlushOptions flush_options;
+  flush_options.wait = true;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+  Destroy(options);
+  delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+  Options options;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_injection_env.get();
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  fault_injection_env->SetFilesystemActive(false);
+  Status s = dbfull()->TEST_SwitchMemtable();
+  fault_injection_env->SetFilesystemActive(true);
+  Destroy(options);
+  ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+  // Regression test for bug where manual flush hangs forever when the DB
+  // is in read-only mode. Verify it now at least returns, despite failing.
+  Options options;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_injection_env.get();
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  // Trigger a first flush but don't let it run.
+  ASSERT_OK(db_->PauseBackgroundWork());
+  ASSERT_OK(Put("key1", "value1"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+
+  // Write a key to the second memtable so we have something to flush later
+  // after the DB is in read-only mode.
+  ASSERT_OK(Put("key2", "value2"));
+
+  // Let the first flush continue, hit an error, and put the DB in read-only
+  // mode.
+  fault_injection_env->SetFilesystemActive(false);
+  ASSERT_OK(db_->ContinueBackgroundWork());
+  // We injected the error into the env, so the returned status is not OK.
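+  // (The failed flush moves the DB into read-only mode; the background-error
+  // counter checked below should then record at least one error.)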
+  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  uint64_t num_bg_errors;
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors));
+  ASSERT_GT(num_bg_errors, 0);
+#endif  // ROCKSDB_LITE
+
+  // In the bug scenario, triggering another flush would cause the second flush
+  // to hang forever. After the fix we expect it to return an error.
+  ASSERT_NOK(db_->Flush(FlushOptions()));
+
+  Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+       {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBImpl::BackgroundCallFlush:start",
+        "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(Put(1, "key", "value"));
+  auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  port::Thread drop_cf_thr([&]() {
+    TEST_SYNC_POINT(
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+    handles_.resize(1);
+    TEST_SYNC_POINT(
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+  });
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+  drop_cf_thr.join();
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+  class TestListener : public EventListener {
+   public:
+    void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+      // There's only one key in each flush.
+      ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+      ASSERT_NE(0, info.smallest_seqno);
+      if (info.smallest_seqno == seq1) {
+        // First flush completed.
+        ASSERT_FALSE(completed1);
+        completed1 = true;
+        CheckFlushResultCommitted(db, seq1);
+      } else {
+        // Second flush completed.
+        ASSERT_FALSE(completed2);
+        completed2 = true;
+        ASSERT_EQ(info.smallest_seqno, seq2);
+        CheckFlushResultCommitted(db, seq2);
+      }
+    }
+
+    void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+      DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+      InstrumentedMutex* mutex = db_impl->mutex();
+      mutex->Lock();
+      auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                      db->DefaultColumnFamily())
+                      ->cfd();
+      ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+      mutex->Unlock();
+    }
+
+    std::atomic<SequenceNumber> seq1{0};
+    std::atomic<SequenceNumber> seq2{0};
+    std::atomic<bool> completed1{false};
+    std::atomic<bool> completed2{false};
+  };
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+        "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+       {"DBImpl::FlushMemTableToOutputFile:Finish",
+        "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+        // Wait for the second flush to finish, outside of the mutex.
+        auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+        if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+          TEST_SYNC_POINT(
+              "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+              "WaitSecond");
+        }
+      });
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+  options.max_background_jobs = 8;
+  // Allow 2 immutable memtables.
+  options.max_write_buffer_number = 3;
+  Reopen(options);
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("foo", "v"));
+  listener->seq1 = db_->GetLatestSequenceNumber();
+  // t1 will wait for the second flush to complete before committing the
+  // flush result.
+  auto t1 = port::Thread([&]() {
+    // flush_opts.wait = true
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  });
+  // Wait for the first flush to start.
+  TEST_SYNC_POINT(
+      "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+  // The second flush will exit early without committing its result. The work
+  // is delegated to the first flush.
+  ASSERT_OK(Put("bar", "v"));
+  listener->seq2 = db_->GetLatestSequenceNumber();
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+  t1.join();
+  // Ensure background work is fully finished, including listener callbacks,
+  // before accessing listener state.
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  ASSERT_TRUE(listener->completed1);
+  ASSERT_TRUE(listener->completed2);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, FlushWithBlob) {
+  constexpr uint64_t min_blob_size = 10;
+
+  Options options;
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  constexpr char short_value[] = "short";
+  static_assert(sizeof(short_value) - 1 < min_blob_size,
+                "short_value too long");
+
+  constexpr char long_value[] = "long_value";
+  static_assert(sizeof(long_value) - 1 >= min_blob_size,
+                "long_value too short");
+
+  ASSERT_OK(Put("key1", short_value));
+  ASSERT_OK(Put("key2", long_value));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("key1"), short_value);
+  ASSERT_EQ(Get("key2"), long_value);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_EQ(l0_files.size(), 1);
+
+  const FileMetaData* const table_file = l0_files[0];
+  assert(table_file);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  const auto& blob_file = blob_files.front();
+  assert(blob_file);
+
+  ASSERT_EQ(table_file->smallest.user_key(), "key1");
+  ASSERT_EQ(table_file->largest.user_key(), "key2");
+  ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+  ASSERT_EQ(table_file->fd.largest_seqno, 2);
+  ASSERT_EQ(table_file->oldest_blob_file_number,
+            blob_file->GetBlobFileNumber());
+
+  ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
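+  // Flush statistics land in the level-0 bucket: bytes_written covers the
+  // table file, bytes_written_blob the blob file, and BYTES_FLUSHED below
+  // should equal their sum.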
+  ASSERT_FALSE(compaction_stats.empty());
+  ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+  ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+            blob_file->GetTotalBlobBytes());
+  ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+  ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+  const uint64_t* const cf_stats_value =
+      internal_stats->TEST_GetCFStatsValue();
+  ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+            compaction_stats[0].bytes_written +
+                compaction_stats[0].bytes_written_blob);
+#endif  // ROCKSDB_LITE
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  options.checksum_handoff_file_types.Add(FileType::kTableFile);
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  // The hash does not match, so the write fails:
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  });
+  ASSERT_OK(Put("key3", "value3"));
+  ASSERT_OK(Put("key4", "value4"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = Flush();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put("key5", "value5"));
+  ASSERT_OK(Put("key6", "value6"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
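+  // (Re-enable a supported checksum type first so that handoff is active
+  // again, then inject corruption into the data before it reaches the file
+  // system.)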
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->IngestDataCorruptionBeforeWrite();
+  });
+  ASSERT_OK(Put("key7", "value7"));
+  ASSERT_OK(Put("key8", "value8"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(Flush());
+
+  // The checksum handoff option is not set, so the checksum handoff will not
+  // be triggered.
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  });
+  ASSERT_OK(Put("key3", "value3"));
+  ASSERT_OK(Put("key4", "value4"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put("key5", "value5"));
+  ASSERT_OK(Put("key6", "value6"));
+  ASSERT_OK(Flush());
+
+  // The checksum handoff option is not set, so the checksum handoff will not
+  // be triggered.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->IngestDataCorruptionBeforeWrite();
+  });
+  ASSERT_OK(Put("key7", "value7"));
+  ASSERT_OK(Put("key8", "value8"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(Flush());
+
+  // The hash does not match, so the write fails:
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // the kFatalError severity.
+ ASSERT_OK(Put("key3", "value3")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, PickRightMemtables) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + options.create_if_missing = true; + + const std::string test_cf_name = "test_cf"; + options.max_write_buffer_number = 128; + CreateColumnFamilies({test_cf_name}, options); + + Close(); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "value")); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) { + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v")); + auto* cfhi = + static_cast_with_check(handles_[1]); + assert(cfhi); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd())); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) { + auto* job = reinterpret_cast(arg); + assert(job); + const auto& mems = job->GetMemTables(); + assert(mems.size() == 1); + assert(mems[0]); + ASSERT_EQ(1, mems[0]->GetID()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBFlushTestBlobError : public DBFlushTest, + public testing::WithParamInterface { + 
+ public:
+  DBFlushTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBFlushTestBlobError, FlushError) {
+  Options options;
+  options.enable_blob_files = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "blob"));
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_NOK(Flush());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_TRUE(l0_files.empty());
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_TRUE(blob_files.empty());
+
+  // Make sure the files generated by the failed job have been deleted.
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t number = 0;
+    FileType type = kTableFile;
+
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+
+    ASSERT_NE(type, kTableFile);
+    ASSERT_NE(type, kBlobFile);
+  }
+
+#ifndef ROCKSDB_LITE
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_FALSE(compaction_stats.empty());
+
+  if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+    ASSERT_EQ(compaction_stats[0].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+  } else {
+    // SST file writing succeeded; blob file writing failed (during Finish).
+    ASSERT_GT(compaction_stats[0].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+    ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+  }
+
+  const uint64_t* const cf_stats_value =
+      internal_stats->TEST_GetCFStatsValue();
+  ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+            compaction_stats[0].bytes_written +
+                compaction_stats[0].bytes_written_blob);
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
+  class SimpleTestFlushListener : public EventListener {
+   public:
+    explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+    ~SimpleTestFlushListener() override {}
+
+    void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+      ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
+      ASSERT_OK(db->Delete(WriteOptions(), "foo"));
+      snapshot_ = db->GetSnapshot();
+      ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+
+      auto* dbimpl = static_cast_with_check<DBImpl>(db);
+      assert(dbimpl);
+
+      ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+      auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+      assert(cfhi);
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
+    }
+
+    DBFlushTest* test_ = nullptr;
+    const Snapshot* snapshot_ = nullptr;
+  };
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  auto* listener = new SimpleTestFlushListener(this);
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+  ManagedSnapshot snapshot_guard(db_);
+
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
+
+  const Snapshot* snapshot = listener->snapshot_;
+  assert(snapshot);
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+
+  // Reading through the snapshot should not see "foo".
+  {
+    std::string value;
+    Status s = db_->Get(read_opts, "foo", &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.allow_2pc = true;
+  options.atomic_flush = GetParam();
+  // 64MB so that memtable flush won't be triggered by the small writes.
+  options.write_buffer_size = (static_cast<uint64_t>(64) << 20);
+
+  // Destroy the DB to recreate it as a TransactionDB.
+  Close();
+  Destroy(options, true);
+
+  // Create a TransactionDB.
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+  txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+  ASSERT_NE(txn_db, nullptr);
+  db_ = txn_db;
+
+  // Create two more column families besides the default CF.
+  std::vector<std::string> cfs = {"puppy", "kitty"};
+  CreateColumnFamilies(cfs, options);
+  ASSERT_EQ(handles_.size(), 2);
+  ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+  ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+  const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  // txn1 only prepares, but does not commit.
+  // The WAL containing the prepared but uncommitted data must be kept.
+  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  // txn2 both prepares and commits.
+  Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_NE(txn2, nullptr);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+    ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+  }
+  // A txn must be named before prepare.
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn2->SetName("txn2"));
+  // Prepare writes to the WAL, but not to the memtable. (WriteCommitted)
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn2->Prepare());
+  // Commit writes to the memtable.
+  ASSERT_OK(txn2->Commit());
+  delete txn1;
+  delete txn2;
+
+  // There is still unflushed data in the memtables.
+  // But since the data is small enough to reside in the active memtables,
+  // there are no immutable memtables.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  // Atomically flush the memtables;
+  // the min log with prepared data should be written to the MANIFEST.
+  std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    cfs_to_flush[i] = handles_[i];
+  }
+  ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+  // There is no data remaining in the memtables after flush.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+  }
+
+  // The recovered min log number with prepared data should be non-zero.
+  // In 2pc mode, MinLogNumberToKeep returns the
+  // VersionSet::min_log_number_to_keep recovered from the MANIFEST; if it's
+  // 0, it means atomic flush didn't write the min_log_number_to_keep to the
+  // MANIFEST.
+  cfs.push_back(kDefaultColumnFamilyName);
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+  ASSERT_TRUE(db_impl->allow_2pc());
+  ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.write_buffer_size = (static_cast<uint64_t>(64) << 20);
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  std::vector<int> cf_ids;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    cf_ids.emplace_back(static_cast<int>(i));
+  }
+  ASSERT_OK(Flush(cf_ids));
+
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+  }
+}
+
+TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.write_buffer_size = (static_cast<uint64_t>(64) << 20);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const size_t num_cfs = handles_.size();
+  ASSERT_EQ(num_cfs, 2);
+  WriteOptions wopts;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+
+  {
+    // Flush the default CF only.
+    std::vector<int> cf_ids{0};
+    ASSERT_OK(Flush(cf_ids));
+
+    autovector<ColumnFamilyData*> flushed_cfds;
+    autovector<autovector<VersionEdit*>> flush_edits;
+    auto flushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[0]);
+    flushed_cfds.push_back(flushed_cfh->cfd());
+    flush_edits.push_back({});
+    auto unflushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[1]);
+
+    ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+                                                 flushed_cfds, flush_edits),
+              unflushed_cfh->cfd()->GetLogNumber());
+  }
+
+  {
+    // Flush all CFs.
+    std::vector<int> cf_ids;
+    for (size_t i = 0; i != num_cfs; ++i) {
+      cf_ids.emplace_back(static_cast<int>(i));
+    }
+    ASSERT_OK(Flush(cf_ids));
+    uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber();
+
+    uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+    autovector<ColumnFamilyData*> flushed_cfds;
+    autovector<autovector<VersionEdit*>> flush_edits;
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      flushed_cfds.push_back(cfh->cfd());
+      flush_edits.push_back({});
+      min_log_number_to_keep =
+          std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber());
+    }
+    ASSERT_EQ(min_log_number_to_keep, log_num_after_flush);
+    ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+                                                 flushed_cfds, flush_edits),
+              min_log_number_to_keep);
+  }
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  // 4KB so that we can easily trigger auto flush.
+  options.write_buffer_size = 4096;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+        "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+  // Keep writing to one of the column families to trigger auto flush.
+  for (int i = 0; i != 4000; ++i) {
+    ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+                  "key" + std::to_string(i), "value" + std::to_string(i),
+                  wopts));
+  }
+
+  TEST_SYNC_POINT(
+      "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+  if (options.atomic_flush) {
+    for (size_t i = 0; i + 1 != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+      ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    }
+  } else {
+    for (size_t i = 0; i + 1 != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+      ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.env = fault_injection_env.get();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+        "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+       {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+        "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+  }
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
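+  // With some flush jobs already complete, disabling the file system forces
+  // the remaining jobs to fail, so the whole atomic flush must be rolled
+  // back: each CF should still have exactly one unflushed immutable memtable
+  // afterwards.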
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+  for (auto* cfh : handles_) {
+    // Returns the IO error that happened during flush.
+    ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh));
+  }
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+  }
+  fault_injection_env->SetFilesystemActive(true);
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  std::vector<int> cf_ids;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+    cf_ids.push_back(cf_id);
+  }
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+       FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+        "DBAtomicFlushTest::BeforeDropCF"},
+       {"DBAtomicFlushTest::AfterDropCF",
+        "DBImpl::BackgroundCallFlush:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+  }
+  port::Thread user_thread([&]() {
+    TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+  });
+  FlushOptions flush_opts;
+  flush_opts.wait = true;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  user_thread.join();
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_EQ("value", Get(cf_id, "key"));
+  }
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+  num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_EQ("value", Get(cf_id, "key"));
+  }
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  const int kNumKeysTriggerFlush = 4;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysTriggerFlush));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+    ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(0, "key", "value"));
+  Close();
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+  bool atomic_flush = GetParam();
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.max_write_buffer_number = 4;
+  // Set min_write_buffer_number_to_merge to be greater than 1, so that
+  // a column family with one memtable in the imm will not cause IsFlushPending
+  // to return true when flush_requested_ is false.
+  options.min_write_buffer_number_to_merge = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+  ASSERT_OK(Put(0, "key00", "value00"));
+  ASSERT_OK(Put(1, "key10", "value10"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  ASSERT_OK(Put(0, "key01", "value01"));
+  // Since max_write_buffer_number is 4, the following flush won't cause a
+  // write stall.
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_[1] = nullptr;
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  delete handles_[0];
+  handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+       {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBImpl::BackgroundCallFlush:start",
+        "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(Put(0, "key", "value"));
+  ASSERT_OK(Put(1, "key", "value"));
+  auto* cfd_default =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+          ->cfd();
+  auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  port::Thread drop_cf_thr([&]() {
+    TEST_SYNC_POINT(
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    delete handles_[1];
+    handles_.resize(1);
+    TEST_SYNC_POINT(
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+  });
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+                                                flush_opts));
+  drop_cf_thr.join();
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  for (size_t cf = 0; cf < handles_.size(); ++cf) {
+    ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+      [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  FlushOptions flush_opts;
+  Status s = db_->Flush(flush_opts, handles_);
+  ASSERT_NOK(s);
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In atomic flush, concurrent bg flush threads commit to the MANIFEST in
+// serial, in the order of their picked memtables for each column family.
+// Only when a bg flush thread finds out that its memtables are the earliest
+// unflushed ones for all the included column families will this bg flush
+// thread continue to commit to the MANIFEST.
+// This unit test uses sync points to coordinate the execution of two bg
+// threads executing the same sequence of functions. The interleaving is as
+// follows.
+// time            bg1                            bg2
+//  |   pick memtables to flush
+//  |   flush memtables cf1_m1, cf2_m1
+//  |   join MANIFEST write queue
+//  |                                   pick memtables to flush
+//  |                                   flush memtables cf1_(m1+1)
+//  |                                   join MANIFEST write queue
+//  |                                   wait to write MANIFEST
+//  |   write MANIFEST
+//  |   IO error
+//  |                                   detect IO error and stop waiting
+//  V
+TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = true;
+  options.env = fault_injection_env.get();
+  // Set a larger value than default so that RocksDB can schedule concurrent
+  // background flush threads.
+  options.max_background_jobs = 8;
+  options.max_write_buffer_number = 8;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  assert(2 == handles_.size());
+
+  WriteOptions write_opts;
+  write_opts.disableWAL = true;
+
+  ASSERT_OK(Put(0, "a", "v_0_a", write_opts));
+  ASSERT_OK(Put(1, "a", "v_1_a", write_opts));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"},
+  });
+
+  std::thread::id bg_flush_thr1, bg_flush_thr2;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCallFlush:start", [&](void*) {
+        if (bg_flush_thr1 == std::thread::id()) {
+          bg_flush_thr1 = std::this_thread::get_id();
+        } else if (bg_flush_thr2 == std::thread::id()) {
+          bg_flush_thr2 = std::this_thread::get_id();
+        }
+      });
+
+  int called = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit",
+      [&](void* arg) {
+        if (std::this_thread::get_id() == bg_flush_thr2) {
+          const auto* ptr = reinterpret_cast<std::pair<Status, bool>*>(arg);
+          assert(ptr);
+          if (0 == called) {
+            // When bg flush thread 2 reaches here for the first time.
+            ASSERT_OK(ptr->first);
+            ASSERT_TRUE(ptr->second);
+          } else if (1 == called) {
+            // When bg flush thread 2 reaches here for the second time.
+            ASSERT_TRUE(ptr->first.IsIOError());
+            ASSERT_FALSE(ptr->second);
+          }
+          ++called;
+          TEST_SYNC_POINT("BgFlushThr2:WaitToCommit");
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+      [&](void*) {
+        if (std::this_thread::get_id() == bg_flush_thr1) {
+          TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest");
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (std::this_thread::get_id() != bg_flush_thr1) {
+          return;
+        }
+        ASSERT_OK(db_->Put(write_opts, "b", "v_1_b"));
+
+        FlushOptions flush_opts;
+        flush_opts.wait = false;
+        std::vector<ColumnFamilyHandle*> cfhs(1, db_->DefaultColumnFamily());
+        ASSERT_OK(dbfull()->Flush(flush_opts, cfhs));
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+        auto* ptr = reinterpret_cast<IOStatus*>(arg);
+        assert(ptr);
+        *ptr = IOStatus::IOError("Injected failure");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError());
+
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.max_write_buffer_number = 2;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::DelayWrite:Start",
+        "DBAtomicFlushTest::NoWaitWhenWritesStopped:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+  for (int i = 0; i < options.max_write_buffer_number; ++i) {
+    ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+  }
+  std::thread stalled_writer([&]() { ASSERT_OK(Put("k", "v")); });
+
+  TEST_SYNC_POINT("DBAtomicFlushTest::NoWaitWhenWritesStopped:0");
+
+  {
+    FlushOptions flush_opts;
+    flush_opts.wait = false;
+    flush_opts.allow_write_stall = true;
+    ASSERT_TRUE(db_->Flush(flush_opts).IsTryAgain());
+  }
+
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  stalled_writer.join();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+                        testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.cc b/src/rocksdb/db/db_impl/compacted_db_impl.cc
new file mode 100644
index 000000000..f18ee0d72
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
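+
+// CompactedDBImpl answers point lookups against a database whose live data
+// sits entirely in one sorted run, skipping the memtable and version-walking
+// machinery of a regular DBImpl. A minimal usage sketch (the path is
+// hypothetical; as Open() below enforces, max_open_files must be -1 and no
+// merge operator may be configured):
+//
+//   rocksdb::Options options;
+//   options.max_open_files = -1;
+//   rocksdb::DB* db = nullptr;
+//   // DB::OpenForReadOnly() tries CompactedDBImpl::Open() first and falls
+//   // back to the generic read-only impl if the files span multiple levels.
+//   rocksdb::Status s =
+//       rocksdb::DB::OpenForReadOnly(options, "/path/to/compacted_db", &db);
+//   if (s.ok()) {
+//     std::string value;
+//     s = db->Get(rocksdb::ReadOptions(), "key", &value);
+//   }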
+
+#ifndef ROCKSDB_LITE
+#include "db/db_impl/compacted_db_impl.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "table/get_context.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+                                 const std::string& dbname)
+    : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
+             /*read_only*/ true),
+      cfd_(nullptr),
+      version_(nullptr),
+      user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t right = files_.num_files - 1;
+  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+    return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+  };
+  return static_cast<size_t>(
+      std::lower_bound(files_.files, files_.files + right, key, cmp) -
+      files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value) {
+  return Get(options, /*column_family*/ nullptr, key, value,
+             /*timestamp*/ nullptr);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value,
+                            std::string* timestamp) {
+  assert(user_comparator_);
+  if (options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  std::string* ts =
+      user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
+  LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, lkey.user_key(), value,
+                         /*columns=*/nullptr, ts, nullptr, nullptr, true,
+                         nullptr, nullptr, nullptr, nullptr, &read_cb);
+
+  const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+  if (user_comparator_->CompareWithoutTimestamp(
+          key, /*a_has_ts=*/false,
+          ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                          user_comparator_->timestamp_size()),
+          /*b_has_ts=*/false) < 0) {
+    return Status::NotFound();
+  }
+  Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
+                                    nullptr);
+  if (!s.ok() && !s.IsNotFound()) {
+    return s;
+  }
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return MultiGet(options, keys, values, /*timestamps*/ nullptr);
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values,
+    std::vector<std::string>* timestamps) {
+  assert(user_comparator_);
+  size_t num_keys = keys.size();
+
+  if (options.timestamp) {
+    Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
+                                  /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  } else {
+    Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (timestamps) {
+    for (auto& ts : *timestamps) {
+      ts.clear();
+    }
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  autovector<TableReader*, 16> reader_list;
+  for (const auto& key : keys) {
+    LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+    const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+    if (user_comparator_->CompareWithoutTimestamp(
+            key, /*a_has_ts=*/false,
+            ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                            user_comparator_->timestamp_size()),
+            /*b_has_ts=*/false) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(num_keys, Status::NotFound());
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      PinnableSlice pinnable_val;
+      std::string& value = (*values)[idx];
+      LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
+      std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
+      GetContext get_context(
+          user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
+          lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
+          user_comparator_->timestamp_size() > 0 ? timestamp : nullptr,
+          nullptr, nullptr, true, nullptr, nullptr, nullptr, nullptr,
+          &read_cb);
+      Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
+      assert(static_cast<size_t>(idx) < statuses.size());
+      if (!s.ok() && !s.IsNotFound()) {
+        statuses[idx] = s;
+      } else {
+        value.assign(pinnable_val.data(), pinnable_val.size());
+        if (get_context.State() == GetContext::kFound) {
+          statuses[idx] = Status::OK();
+        }
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({cf}, true /* read only */, false, true);
+  if (s.ok()) {
+    cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+               ->cfd();
+    cfd_->InstallSuperVersion(&sv_context, &mutex_);
+  }
+  mutex_.Unlock();
+  sv_context.Clean();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 should not have files
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contain more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other level contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+                             DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    s = db->StartPeriodicTaskScheduler();
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+                   "Opened the db as fully compacted mode");
+    LogFlush(db->immutable_db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.h b/src/rocksdb/db/db_impl/compacted_db_impl.h
new file mode 100644
index 000000000..eb458b85d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
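+
+// A compacted DB keeps its live SST files in a single sorted run (files_
+// below), so a point lookup is one binary search over the files' largest
+// keys plus at most one table read. An illustrative example (hypothetical
+// keys): with three files whose largest user keys are "d", "m" and "z",
+//
+//   FindFile("a") == 0  // first file whose largest key is >= "a"
+//   FindFile("g") == 1  // "g" sorts between "d" and "m"
+//   FindFile("x") == 2  // only the last file can contain "x"
+//
+// Get() then returns NotFound without any table I/O when the key also sorts
+// below the chosen file's smallest key.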
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&) = delete;
+  void operator=(const CompactedDBImpl&) = delete;
+
+  ~CompactedDBImpl() override;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  using DB::MultiGet;
+  // Note that CompactedDBImpl::MultiGet is not the optimized version of
+  // MultiGet to use.
+  // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<ColumnFamilyHandle*>&,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values,
+                               std::vector<std::string>* timestamps) override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& /*options*/,
+                       WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice* /*begin*/,
+                              const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>& ret,
+                              uint64_t* manifest_file_size,
+                              bool /*flush_memtable*/) override {
+    return DBImpl::GetLiveFiles(ret, manifest_file_size,
+                                false /* flush_memtable */);
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/) override {
mode."); + } + + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + // FIXME: some missing overrides for more "write" functions + // Share with DBImplReadOnly? + + protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + + private: + friend class DB; + inline size_t FindFile(const Slice& key); + Status Init(const Options& options); + + ColumnFamilyData* cfd_; + Version* version_; + const Comparator* user_comparator_; + LevelFilesBrief files_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc new file mode 100644 index 000000000..a431111d4 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl.cc @@ -0,0 +1,5918 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/db_impl/db_impl.h" + +#include +#ifdef OS_SOLARIS +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/builder.h" +#include "db/compaction/compaction_job.h" +#include "db/db_info_dumper.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/flush_job.h" +#include "db/forward_iterator.h" +#include "db/import_column_family_job.h" +#include "db/job_context.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/malloc_stats.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/periodic_task_scheduler.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/write_callback.h" +#include "env/unique_id_gen.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/in_memory_stats_history.h" +#include "monitoring/instrumented_mutex.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/stats_history.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/get_context.h" +#include "table/merging_iterator.h" +#include "table/multiget_context.h" +#include "table/sst_file_dumper.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "trace_replay/trace_replay.h" +#include "util/autovector.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/defer.h" +#include "util/distributed_mutex.h" +#include "util/hash_containers.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "utilities/trace/replayer_impl.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +void DumpRocksDBBuildVersion(Logger* log); + +CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. 
+  // Otherwise the CPU and latency overhead is not offset by saving much
+  // space.
+  if (ioptions.compaction_style == kCompactionStyleUniversal &&
+      mutable_cf_options.compaction_options_universal
+              .compression_size_percent >= 0) {
+    return kNoCompression;
+  }
+  if (mutable_cf_options.compression_per_level.empty()) {
+    return mutable_cf_options.compression;
+  } else {
+    // For leveled compress when min_level_to_compress != 0.
+    return mutable_cf_options.compression_per_level[0];
+  }
+}
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+  ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+  for (auto& compression : OptionsHelper::compression_type_string_map) {
+    if (compression.second != kNoCompression &&
+        compression.second != kDisableCompressionOption) {
+      ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+                       CompressionTypeSupported(compression.second));
+    }
+  }
+  ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+                   crc32c::IsFastCrc32Supported().c_str());
+
+  ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
+}
+}  // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+               const bool seq_per_batch, const bool batch_per_txn,
+               bool read_only)
+    : dbname_(dbname),
+      own_info_log_(options.info_log == nullptr),
+      init_logger_creation_s_(),
+      initial_db_options_(SanitizeOptions(dbname, options, read_only,
+                                          &init_logger_creation_s_)),
+      env_(initial_db_options_.env),
+      io_tracer_(std::make_shared<IOTracer>()),
+      immutable_db_options_(initial_db_options_),
+      fs_(immutable_db_options_.fs, io_tracer_),
+      mutable_db_options_(initial_db_options_),
+      stats_(immutable_db_options_.stats),
+#ifdef COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_,
+             immutable_db_options_.use_adaptive_mutex),
+#else   // COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
+             immutable_db_options_.use_adaptive_mutex),
+#endif  // COERCE_CONTEXT_SWITCH
+      default_cf_handle_(nullptr),
+      error_handler_(this, immutable_db_options_, &mutex_),
+      event_logger_(immutable_db_options_.info_log.get()),
+      max_total_in_memory_state_(0),
+      file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+      file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+          file_options_, immutable_db_options_)),
+      seq_per_batch_(seq_per_batch),
+      batch_per_txn_(batch_per_txn),
+      next_job_id_(1),
+      shutting_down_(false),
+      db_lock_(nullptr),
+      manual_compaction_paused_(false),
+      bg_cv_(&mutex_),
+      logfile_number_(0),
+      log_dir_synced_(false),
+      log_empty_(true),
+      persist_stats_cf_handle_(nullptr),
+      log_sync_cv_(&log_write_mutex_),
+      total_log_size_(0),
+      is_snapshot_supported_(true),
+      write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+      write_thread_(immutable_db_options_),
+      nonmem_write_thread_(immutable_db_options_),
+      write_controller_(mutable_db_options_.delayed_write_rate),
+      last_batch_group_size_(0),
+      unscheduled_flushes_(0),
+      unscheduled_compactions_(0),
+      bg_bottom_compaction_scheduled_(0),
+      bg_compaction_scheduled_(0),
+      num_running_compactions_(0),
+      bg_flush_scheduled_(0),
+      num_running_flushes_(0),
+      bg_purge_scheduled_(0),
+      disable_delete_obsolete_files_(0),
+      pending_purge_obsolete_files_(0),
+      delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
+      last_stats_dump_time_microsec_(0),
+      has_unpersisted_data_(false),
+      unable_to_release_oldest_log_(false),
+      num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+      wal_manager_(immutable_db_options_, file_options_, io_tracer_,
+                   seq_per_batch),
+#endif  // ROCKSDB_LITE
+      bg_work_paused_(0),
+      bg_compaction_paused_(0),
+      refitting_level_(false),
+      opened_successfully_(false),
+#ifndef ROCKSDB_LITE
+      periodic_task_scheduler_(),
+#endif  // ROCKSDB_LITE
+      two_write_queues_(options.two_write_queues),
+      manual_wal_flush_(options.manual_wal_flush),
+      // last_sequence_ is always maintained by the main queue that also writes
+      // to the memtable. When two_write_queues_ is disabled, last seq in
+      // memtable is the same as last seq published to the readers. When it is
+      // enabled but seq_per_batch_ is disabled, last seq in memtable still
+      // indicates last published seq since wal-only writes that go to the 2nd
+      // queue do not consume a sequence number. Otherwise writes performed by
+      // the 2nd queue could change what is visible to the readers. In this
+      // case, last_seq_same_as_publish_seq_ == false, and the 2nd queue
+      // maintains a separate variable to indicate the last published sequence.
+      last_seq_same_as_publish_seq_(
+          !(seq_per_batch && options.two_write_queues)),
+      // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+      // requires a custom gc for compaction, we use that to set use_custom_gc_
+      // as well.
+      use_custom_gc_(seq_per_batch),
+      shutdown_initiated_(false),
+      own_sfm_(options.sst_file_manager == nullptr),
+      closed_(false),
+      atomic_flush_install_cv_(&mutex_),
+      blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
+                     &error_handler_, &event_logger_,
+                     immutable_db_options_.listeners, dbname_) {
+  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+  // WriteUnprepared, which should use seq_per_batch_.
+  assert(batch_per_txn_ || seq_per_batch_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+                                   ? TableCache::kInfiniteCapacity
+                                   : mutable_db_options_.max_open_files - 10;
+  LRUCacheOptions co;
+  co.capacity = table_cache_size;
+  co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  table_cache_ = NewLRUCache(co);
+  SetDbSessionId();
+  assert(!db_session_id_.empty());
+
+#ifndef ROCKSDB_LITE
+  periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats,
+                                   [this]() { this->DumpStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats,
+                                   [this]() { this->PersistStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+                                   [this]() { this->FlushInfoLog(); });
+  periodic_task_functions_.emplace(
+      PeriodicTaskType::kRecordSeqnoTime,
+      [this]() { this->RecordSeqnoToTimeMapping(); });
+#endif  // ROCKSDB_LITE
+
+  versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+                                 table_cache_.get(), write_buffer_manager_,
+                                 &write_controller_, &block_cache_tracer_,
+                                 io_tracer_, db_id_, db_session_id_));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+  DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_);
+  immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+                            std::memory_order_relaxed);
+  if (write_buffer_manager_) {
+    wbm_stall_.reset(new WBMStallInterface());
+  }
+}
+
+Status DBImpl::Resume() {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+  InstrumentedMutexLock db_mutex(&mutex_);
+
+  if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+    // Nothing to do
+    return Status::OK();
+  }
+
+  if (error_handler_.IsRecoveryInProgress()) {
+    // Don't allow a mix of manual and automatic recovery
+    return Status::Busy();
+  }
+
+  mutex_.Unlock();
+  Status s = error_handler_.RecoverFromBGError(true);
+  mutex_.Lock();
+  return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to avoid inadvertently causing an error and thinking recovery
+//    failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+//    another error, which will be saved by error_handler_ and reported later
+//    as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs.
+//    This is needed as the flush in the prior step might have been a no-op
+//    for some CFs, which means a new super version wouldn't have been
+//    installed
+Status DBImpl::ResumeImpl(DBRecoverContext context) {
+  mutex_.AssertHeld();
+  WaitForBackgroundWork();
+
+  Status s;
+  if (shutdown_initiated_) {
+    // Returning shutdown status to SFM during auto recovery will cause it
+    // to abort the recovery and allow the shutdown to progress
+    s = Status::ShutdownInProgress();
+  }
+
+  if (s.ok()) {
+    Status bg_error = error_handler_.GetBGError();
+    if (bg_error.severity() > Status::Severity::kHardError) {
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "DB resume requested but failed due to Fatal/Unrecoverable error");
+      s = bg_error;
+    }
+  }
+
+  // Make sure the IO Status stored in version set is set to OK.
+  bool file_deletion_disabled = !IsFileDeletionsEnabled();
+  if (s.ok()) {
+    IOStatus io_s = versions_->io_status();
+    if (io_s.IsIOError()) {
+      // If resuming from IOError resulted from MANIFEST write, then assert
+      // that we must have already set the MANIFEST writer to nullptr during
+      // clean-up phase MANIFEST writing. We must have also disabled file
+      // deletions.
+      assert(!versions_->descriptor_log_);
+      assert(file_deletion_disabled);
+      // Since we are trying to recover from MANIFEST write error, we need to
+      // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted.
+      // Therefore, force writing a dummy version edit because we do not know
+      // whether there are flush jobs with non-empty data to flush, triggering
+      // appends to MANIFEST.
+      VersionEdit edit;
+      auto cfh =
+          static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
+      assert(cfh);
+      ColumnFamilyData* cfd = cfh->cfd();
+      const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions();
+      s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_,
+                                 directories_.GetDbDir());
+      if (!s.ok()) {
+        io_s = versions_->io_status();
+        if (!io_s.ok()) {
+          s = error_handler_.SetBGError(io_s,
+                                        BackgroundErrorReason::kManifestWrite);
+        }
+      }
+    }
+  }
+
+  // We cannot guarantee consistency of the WAL. So force flush Memtables of
+  // all the column families
+  if (s.ok()) {
+    FlushOptions flush_opts;
+    // We allow flush to stall write since we are trying to resume from error.
+    flush_opts.allow_write_stall = true;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
+      mutex_.Lock();
+    } else {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        InstrumentedMutexUnlock u(&mutex_);
+        s = FlushMemTable(cfd, flush_opts, context.flush_reason);
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "DB resume requested but failed due to Flush failure [%s]",
+                     s.ToString().c_str());
+    }
+  }
+
+  JobContext job_context(0);
+  FindObsoleteFiles(&job_context, true);
+  mutex_.Unlock();
+
+  job_context.manifest_file_number = 1;
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+
+  if (s.ok()) {
+    assert(versions_->io_status().ok());
+    // If we reach here, we should re-enable file deletions if it was disabled
+    // during previous error handling.
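+    // EnableFileDeletions(/*force=*/true) resets the disable counter outright
+    // rather than decrementing it, so deletions come back on even if they had
+    // been disabled more than once before the error.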
+    if (file_deletion_disabled) {
+      // Always return ok
+      s = EnableFileDeletions(/*force=*/true);
+      if (!s.ok()) {
+        ROCKS_LOG_INFO(
+            immutable_db_options_.info_log,
+            "DB resume requested but could not enable file deletions [%s]",
+            s.ToString().c_str());
+        assert(false);
+      }
+    }
+  }
+
+  mutex_.Lock();
+  if (s.ok()) {
+    // This will notify and unblock threads waiting for error recovery to
+    // finish. Those previously waiting threads can now proceed, which may
+    // include closing the db.
+    s = error_handler_.ClearBGError();
+  } else {
+    // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+    // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+    // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+    error_handler_.GetRecoveryError().PermitUncheckedError();
+  }
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+                   s.ToString().c_str());
+  }
+
+  // Check for shutdown again before scheduling further compactions,
+  // since we released and re-acquired the lock above
+  if (shutdown_initiated_) {
+    s = Status::ShutdownInProgress();
+  }
+  if (s.ok()) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      SchedulePendingCompaction(cfd);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  // Wake up any waiters - in this case, it could be the shutdown thread
+  bg_cv_.SignalAll();
+
+  // No need to check BGError again. If something happened, event listener
+  // would be notified and the operation causing it would have failed
+  return s;
+}
+
+void DBImpl::WaitForBackgroundWork() {
+  // Wait for background work to finish
+  while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_) {
+    bg_cv_.Wait();
+  }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Shutdown: canceling all background work");
+
+#ifndef ROCKSDB_LITE
+  for (uint8_t task_type = 0;
+       task_type < static_cast<uint8_t>(PeriodicTaskType::kMax); task_type++) {
+    Status s = periodic_task_scheduler_.Unregister(
+        static_cast<PeriodicTaskType>(task_type));
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to unregister periodic task %d, status: %s",
+                     task_type, s.ToString().c_str());
+    }
+  }
+#endif  // !ROCKSDB_LITE
+
+  InstrumentedMutexLock l(&mutex_);
+  if (!shutting_down_.load(std::memory_order_acquire) &&
+      has_unpersisted_data_.load(std::memory_order_relaxed) &&
+      !mutable_db_options_.avoid_flush_during_shutdown) {
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      Status s =
+          AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+      s.PermitUncheckedError();  //**TODO: What to do on error?
+      mutex_.Lock();
+    } else {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+        if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+          InstrumentedMutexUnlock u(&mutex_);
+          Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+          s.PermitUncheckedError();  //**TODO: What to do on error?
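+          // Shutdown continues even if this final best-effort flush fails;
+          // the unpersisted memtable data is simply lost, the same outcome
+          // avoid_flush_during_shutdown would have chosen deliberately.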
+        }
+      }
+    }
+  }
+
+  shutting_down_.store(true, std::memory_order_release);
+  bg_cv_.SignalAll();
+  if (!wait) {
+    return;
+  }
+  WaitForBackgroundWork();
+}
+
+Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
+  size_t num_snapshots = 0;
+  ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits<uint64_t>::max(),
+                                       &num_snapshots);
+
+  // If there is unreleased snapshot, fail the close call
+  if (num_snapshots > 0) {
+    return Status::Aborted("Cannot close DB with unreleased snapshot.");
+  }
+
+  return Status::OK();
+}
+
+Status DBImpl::CloseHelper() {
+  // Guarantee that there is no background error recovery in progress before
+  // continuing with the shutdown
+  mutex_.Lock();
+  shutdown_initiated_ = true;
+  error_handler_.CancelErrorRecovery();
+  while (error_handler_.IsRecoveryInProgress()) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
+
+  // Below check is added as recovery_error_ is not checked and it causes crash
+  // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is
+  // reached.
+  error_handler_.GetRecoveryError().PermitUncheckedError();
+
+  // CancelAllBackgroundWork called with false means we just set the shutdown
+  // marker. After this we do a variant of the waiting and unschedule work
+  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+  CancelAllBackgroundWork(false);
+
+  // Cancel manual compaction if there's any
+  if (HasPendingManualCompaction()) {
+    DisableManualCompaction();
+  }
+  mutex_.Lock();
+  // Unschedule all tasks for this DB
+  for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+  }
+
+  Status ret = Status::OK();
+
+  // Wait for background work to finish
+  while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ || bg_purge_scheduled_ ||
+         pending_purge_obsolete_files_ ||
+         error_handler_.IsRecoveryInProgress()) {
+    TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+    bg_cv_.Wait();
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+                           &files_grabbed_for_purge_);
+  EraseThreadStatusDbInfo();
+  flush_scheduler_.Clear();
+  trim_history_scheduler_.Clear();
+
+  while (!flush_queue_.empty()) {
+    const FlushRequest& flush_req = PopFirstFromFlushQueue();
+    for (const auto& iter : flush_req) {
+      iter.first->UnrefAndTryDelete();
+    }
+  }
+
+  while (!compaction_queue_.empty()) {
+    auto cfd = PopFirstFromCompactionQueue();
+    cfd->UnrefAndTryDelete();
+  }
+
+  if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+    // we need to delete handle outside of lock because it does its own locking
+    mutex_.Unlock();
+    if (default_cf_handle_) {
+      delete default_cf_handle_;
+      default_cf_handle_ = nullptr;
+    }
+    if (persist_stats_cf_handle_) {
+      delete persist_stats_cf_handle_;
+      persist_stats_cf_handle_ = nullptr;
+    }
+    mutex_.Lock();
+  }
+
+  // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check if we Open()/Recover() the DB successfully before
+  // deleting because if VersionSet recover fails (may be due to corrupted
+  // manifest file), it is not able to identify live files correctly. As a
+  // result, all "live" files can get deleted by accident.
+  // However, corrupted manifest is recoverable by RepairDB().
+  if (opened_successfully_) {
+    JobContext job_context(next_job_id_.fetch_add(1));
+    FindObsoleteFiles(&job_context, true);
+
+    mutex_.Unlock();
+    // manifest number starting from 2
+    job_context.manifest_file_number = 1;
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+    mutex_.Lock();
+  }
+  {
+    InstrumentedMutexLock lock(&log_write_mutex_);
+    for (auto l : logs_to_free_) {
+      delete l;
+    }
+    for (auto& log : logs_) {
+      uint64_t log_number = log.writer->get_log_number();
+      Status s = log.ClearWriter();
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to Sync WAL file %s with error -- %s",
+            LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
+            s.ToString().c_str());
+        // Retain the first error
+        if (ret.ok()) {
+          ret = s;
+        }
+      }
+    }
+    logs_.clear();
+  }
+
+  // Table cache may have table handles holding blocks from the block cache.
+  // We need to release them before the block cache is destroyed. The block
+  // cache may be destroyed inside versions_.reset(), when column family data
+  // list is destroyed, so leaving handles in table cache after
+  // versions_.reset() may cause issues. Here we clean all unreferenced handles
+  // in table cache. Now we assume all user queries have finished, so only
+  // version set itself can possibly hold the blocks from block cache. After
+  // releasing unreferenced handles here, only handles held by version set
+  // remain, and versions_.reset() will release them. There, we need to make
+  // sure every time a handle is released, we erase it from the cache too. By
+  // doing that, we can guarantee that after versions_.reset(), table cache is
+  // empty so the cache can be safely destroyed.
+  table_cache_->EraseUnRefEntries();
+
+  for (auto& txn_entry : recovered_transactions_) {
+    delete txn_entry.second;
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    // TODO: Check for unlock error
+    env_->UnlockFile(db_lock_).PermitUncheckedError();
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+  LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), call
+  // Close() on it before closing the info_log. Otherwise, background thread
+  // in SstFileManagerImpl might try to log something
+  if (immutable_db_options_.sst_file_manager && own_sfm_) {
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    sfm->Close();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (immutable_db_options_.info_log && own_info_log_) {
+    Status s = immutable_db_options_.info_log->Close();
+    if (!s.ok() && !s.IsNotSupported() && ret.ok()) {
+      ret = s;
+    }
+  }
+
+  if (write_buffer_manager_ && wbm_stall_) {
+    write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get());
+  }
+
+  IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */);
+  if (!io_s.ok()) {
+    ret = io_s;
+  }
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for those where users didn't release
+    // certain resource and they can release them and come back and
+    // retry. In this case, we wrap this exception to something else.
+    return Status::Incomplete(ret.ToString());
+  }
+
+  return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+  // TODO: remove this.
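+  // In ASSERT_STATUS_CHECKED builds every Status must be inspected before it
+  // is destroyed; the logger-creation status captured in the constructor may
+  // never have been looked at, so mark it checked here.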
+  init_logger_creation_s_.PermitUncheckedError();
+
+  InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+  if (closed_) {
+    return;
+  }
+
+  closed_ = true;
+
+  {
+    const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+    s.PermitUncheckedError();
+  }
+
+  closing_status_ = CloseImpl();
+  closing_status_.PermitUncheckedError();
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || immutable_db_options_.paranoid_checks) {
+    // No change needed
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+                   s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+  if (immutable_db_options_.WAL_ttl_seconds > 0 ||
+      immutable_db_options_.WAL_size_limit_MB > 0) {
+    std::string archivalPath =
+        ArchivalDirectory(immutable_db_options_.GetWalDir());
+    return env_->CreateDirIfMissing(archivalPath);
+  }
+  return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+  auto dbstats = immutable_db_options_.stats;
+  if (dbstats) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+                   dbstats->ToString().c_str());
+  }
+}
+
+Status DBImpl::StartPeriodicTaskScheduler() {
+#ifndef ROCKSDB_LITE
+
+#ifndef NDEBUG
+  // Only used by tests to disable the scheduler
+  bool disable_scheduler = false;
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImpl::StartPeriodicTaskScheduler:DisableScheduler",
+      &disable_scheduler);
+  if (disable_scheduler) {
+    return Status::OK();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicTaskScheduler:Init",
+                             &periodic_task_scheduler_);
+  }
+
+#endif  // !NDEBUG
+  if (mutable_db_options_.stats_dump_period_sec > 0) {
+    Status s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kDumpStats,
+        periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+        mutable_db_options_.stats_dump_period_sec);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  if (mutable_db_options_.stats_persist_period_sec > 0) {
+    Status s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kPersistStats,
+        periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+        mutable_db_options_.stats_persist_period_sec);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  Status s = periodic_task_scheduler_.Register(
+      PeriodicTaskType::kFlushInfoLog,
+      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+
+  return s;
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
+#ifndef ROCKSDB_LITE
+  uint64_t min_time_duration = std::numeric_limits<uint64_t>::max();
+  uint64_t max_time_duration = std::numeric_limits<uint64_t>::min();
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // preserve time is the max of 2 options.
+      uint64_t preserve_time_duration =
+          std::max(cfd->ioptions()->preserve_internal_time_seconds,
+                   cfd->ioptions()->preclude_last_level_data_seconds);
+      if (!cfd->IsDropped() && preserve_time_duration > 0) {
+        min_time_duration = std::min(preserve_time_duration, min_time_duration);
+        max_time_duration = std::max(preserve_time_duration, max_time_duration);
+      }
+    }
+    if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
+      seqno_time_mapping_.Resize(0, 0);
+    } else {
+      seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
+    }
+  }
+
+  uint64_t seqno_time_cadence = 0;
+  if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
+    // round up to 1 when the time_duration is smaller than
+    // kMaxSeqnoTimePairsPerCF
+    seqno_time_cadence =
+        (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) /
+        SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
+  }
+
+  Status s;
+  if (seqno_time_cadence == 0) {
+    s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
+  } else {
+    s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kRecordSeqnoTime,
+        periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
+        seqno_time_cadence);
+  }
+
+  return s;
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+// estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+  size_t size_total =
+      sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+  if (stats_history_.size() == 0) return size_total;
+  size_t size_per_slice =
+      sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+  // non-empty map, stats_history_.begin() guaranteed to exist
+  for (const auto& pairs : stats_history_.begin()->second) {
+    size_per_slice +=
+        pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+  }
+  size_total = size_per_slice * stats_history_.size();
+  return size_total;
+}
+
+void DBImpl::PersistStats() {
+  TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+  if (shutdown_initiated_) {
+    return;
+  }
+  TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning");
+  uint64_t now_seconds =
+      immutable_db_options_.clock->NowMicros() / kMicrosInSecond;
+
+  Statistics* statistics = immutable_db_options_.stats;
+  if (!statistics) {
+    return;
+  }
+  size_t stats_history_size_limit = 0;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+  }
+
+  std::map<std::string, uint64_t> stats_map;
+  if (!statistics->getTickerMap(&stats_map)) {
+    return;
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- PERSISTING STATS -------");
+
+  if (immutable_db_options_.persist_stats_to_disk) {
+    WriteBatch batch;
+    Status s = Status::OK();
+    if (stats_slice_initialized_) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+                     stats_slice_.size());
+      for (const auto& stat : stats_map) {
+        if (s.ok()) {
+          char key[100];
+          int length =
+              EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+          // calculate the delta from last time
+          if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+            uint64_t delta = stat.second - stats_slice_[stat.first];
+            s = batch.Put(persist_stats_cf_handle_,
+                          Slice(key, std::min(100, length)),
+                          std::to_string(delta));
+          }
+        }
+      }
+    }
+    stats_slice_initialized_ = true;
+    std::swap(stats_slice_, stats_map);
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Writing to persistent stats CF failed -- %s",
+                     s.ToString().c_str());
+    } else {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+                     " to persistent stats CF succeeded",
+                     stats_slice_.size(), now_seconds);
+    }
+    // TODO(Zhongyi): add purging for persisted data
+  } else {
+    InstrumentedMutexLock l(&stats_history_mutex_);
+    // calculate the delta from last time
+    if (stats_slice_initialized_) {
+      std::map<std::string, uint64_t> stats_delta;
+      for (const auto& stat : stats_map) {
+        if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+          stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+        }
+      }
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+                     " to in-memory stats history",
+                     stats_slice_.size(), now_seconds);
+      stats_history_[now_seconds] = stats_delta;
+    }
+    stats_slice_initialized_ = true;
+    std::swap(stats_slice_, stats_map);
+    TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+    // delete older stats snapshots to control memory consumption
+    size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+    bool purge_needed = stats_history_size > stats_history_size_limit;
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+                   " bytes, slice count: %" ROCKSDB_PRIszt,
+                   stats_history_size, stats_history_.size());
+    while (purge_needed && !stats_history_.empty()) {
+      stats_history_.erase(stats_history_.begin());
+      purge_needed =
+          EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+                   " bytes, slice count: %" ROCKSDB_PRIszt,
+                   stats_history_size, stats_history_.size());
+  }
+  TEST_SYNC_POINT("DBImpl::PersistStats:End");
+#endif  // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+                             uint64_t* new_time,
+                             std::map<std::string, uint64_t>* stats_map) {
+  assert(new_time);
+  assert(stats_map);
+  if (!new_time || !stats_map) return false;
+  // lock when search for start_time
+  {
+    InstrumentedMutexLock l(&stats_history_mutex_);
+    auto it = stats_history_.lower_bound(start_time);
+    if (it != stats_history_.end() && it->first < end_time) {
+      // make a copy for timestamp and stats_map
+      *new_time = it->first;
+      *stats_map = it->second;
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
+
+Status DBImpl::GetStatsHistory(
+    uint64_t start_time, uint64_t end_time,
+    std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+  if (!stats_iterator) {
+    return Status::InvalidArgument("stats_iterator not preallocated.");
+  }
+  if (immutable_db_options_.persist_stats_to_disk) {
+    stats_iterator->reset(
+        new PersistentStatsHistoryIterator(start_time, end_time, this));
+  } else {
+    stats_iterator->reset(
+        new InMemoryStatsHistoryIterator(start_time, end_time, this));
+  }
+  return (*stats_iterator)->status();
+}
+
+void DBImpl::DumpStats() {
+  TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+  std::string stats;
+  if (shutdown_initiated_) {
+    return;
+  }
+
+  // Also probe block cache(s) for problems, dump to info log
+  UnorderedSet<Cache*> probed_caches;
+  TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+      if (!cfd->initialized()) {
+        continue;
+      }
+
+      // Release DB mutex for gathering cache entry stats. Pass over all
+      // column families for this first so that other stats are dumped
+      // near-atomically.
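+      // InstrumentedMutexUnlock is the inverse RAII helper: it releases
+      // mutex_ for the rest of this loop iteration and re-acquires it when
+      // the scope ends, so cache-entry stats are gathered without holding
+      // the DB mutex.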
+      InstrumentedMutexUnlock u(&mutex_);
+      cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+      // Probe block cache for problems (if not already via another CF)
+      if (immutable_db_options_.info_log) {
+        auto* table_factory = cfd->ioptions()->table_factory.get();
+        assert(table_factory != nullptr);
+        Cache* cache =
+            table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+        if (cache && probed_caches.insert(cache).second) {
+          cache->ReportProblems(immutable_db_options_.info_log);
+        }
+      }
+    }
+
+    const std::string* property = &DB::Properties::kDBStats;
+    const DBPropertyInfo* property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    default_cf_internal_stats_->GetStringProperty(*property_info, *property,
+                                                  &stats);
+
+    property = &InternalStats::kPeriodicCFStats;
+    property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->initialized()) {
+        cfd->internal_stats()->GetStringProperty(*property_info, *property,
+                                                 &stats);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::DumpStats:2");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- DUMPING STATS -------");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+  if (immutable_db_options_.dump_malloc_stats) {
+    stats.clear();
+    DumpMallocStats(&stats);
+    if (!stats.empty()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "------- Malloc STATS -------");
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+    }
+  }
+#endif  // !ROCKSDB_LITE
+
+  PrintStatistics();
+}
+
+// Periodically flush info log out of application buffer at a low frequency.
+// This improves debuggability in case of RocksDB hanging since it ensures the
+// log messages leading up to the hang will eventually become visible in the
+// log.
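+// (Registered as PeriodicTaskType::kFlushInfoLog in
+// StartPeriodicTaskScheduler() above; no explicit period is passed there, so
+// the scheduler's default cadence for this task applies.)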
+void DBImpl::FlushInfoLog() {
+  if (shutdown_initiated_) {
+    return;
+  }
+  TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning");
+  LogFlush(immutable_db_options_.info_log);
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+                                           int max_entries_to_print,
+                                           std::string* out_str) {
+  auto* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+  Version* version = super_version->current;
+
+  Status s =
+      version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+  CleanupSuperVersion(super_version);
+  return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+  mutex_.AssertHeld();
+  if (!job_context->logs_to_free.empty()) {
+    for (auto l : job_context->logs_to_free) {
+      AddToLogsToFreeQueue(l);
+    }
+    job_context->logs_to_free.clear();
+  }
+}
+
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+  assert(cfd);
+  FSDirectory* ret_dir = cfd->GetDataDir(path_id);
+  if (ret_dir == nullptr) {
+    return directories_.GetDataDir(path_id);
+  }
+  return ret_dir;
+}
+
+Status DBImpl::SetOptions(
+    ColumnFamilyHandle* column_family,
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  (void)column_family;
+  (void)options_map;
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetOptions() on column family [%s], empty input",
+                   cfd->GetName().c_str());
+    return Status::InvalidArgument("empty input");
+  }
+
+  MutableCFOptions new_options;
+  Status s;
+  Status persist_options_status;
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  {
+    auto db_options = GetDBOptions();
+    InstrumentedMutexLock l(&mutex_);
+    s = cfd->SetOptions(db_options, options_map);
+    if (s.ok()) {
+      new_options = *cfd->GetLatestMutableCFOptions();
+      // Append new version to recompute compaction score.
+      VersionEdit dummy_edit;
+      s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+                                 directories_.GetDbDir());
+      // Trigger possible flush/compactions. This has to be before we persist
+      // options to file, otherwise there will be a deadlock with writer
+      // thread.
+      InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+      persist_options_status = WriteOptionsFile(
+          false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+      bg_cv_.SignalAll();
+    }
+  }
+  sv_context.Clean();
+
+  ROCKS_LOG_INFO(
+      immutable_db_options_.info_log,
+      "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      // NOTE: WriteOptionsFile already logs on failure
+      s = persist_options_status;
+    }
+  } else {
+    persist_options_status.PermitUncheckedError();  // less important
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+                   cfd->GetName().c_str());
+  }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::SetDBOptions(
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  (void)options_map;
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetDBOptions(), empty input.");
+    return Status::InvalidArgument("empty input");
+  }
+
+  MutableDBOptions new_options;
+  Status s;
+  Status persist_options_status = Status::OK();
+  bool wal_changed = false;
+  WriteContext write_context;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+                                       &new_options);
+
+    if (new_options.bytes_per_sync == 0) {
+      new_options.bytes_per_sync = 1024 * 1024;
+    }
+
+    if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "SetDBOptions(), input option value is not changed, "
+                     "skipping updating.");
+      persist_options_status.PermitUncheckedError();
+      return s;
+    }
+
+    DBOptions new_db_options =
+        BuildDBOptions(immutable_db_options_, new_options);
+    if (s.ok()) {
+      s = ValidateOptions(new_db_options);
+    }
+    if (s.ok()) {
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped()) {
+          auto cf_options = c->GetLatestCFOptions();
+          s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+          if (!s.ok()) {
+            break;
+          }
+        }
+      }
+    }
+    if (s.ok()) {
+      const BGJobLimits current_bg_job_limits =
+          GetBGJobLimits(mutable_db_options_.max_background_flushes,
+                         mutable_db_options_.max_background_compactions,
+                         mutable_db_options_.max_background_jobs,
+                         /* parallelize_compactions */ true);
+      const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+          new_options.max_background_flushes,
+          new_options.max_background_compactions,
+          new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+      const bool max_flushes_increased =
+          new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+      const bool max_compactions_increased =
+          new_bg_job_limits.max_compactions >
+          current_bg_job_limits.max_compactions;
+
+      if (max_flushes_increased || max_compactions_increased) {
+        if (max_flushes_increased) {
+          env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+                                             Env::Priority::HIGH);
+        }
+
+        if (max_compactions_increased) {
+          env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+                                             Env::Priority::LOW);
+        }
+
+        MaybeScheduleFlushOrCompaction();
+      }
+
+      mutex_.Unlock();
+      if (new_options.stats_dump_period_sec == 0) {
+        s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kDumpStats);
+      } else {
+        s = periodic_task_scheduler_.Register(
+            PeriodicTaskType::kDumpStats,
+            periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+            new_options.stats_dump_period_sec);
+      }
+      if (new_options.max_total_wal_size !=
+          mutable_db_options_.max_total_wal_size) {
+        max_total_wal_size_.store(new_options.max_total_wal_size,
+                                  std::memory_order_release);
+      }
+      if (s.ok()) {
+        if (new_options.stats_persist_period_sec == 0) {
+          s = periodic_task_scheduler_.Unregister(
+              PeriodicTaskType::kPersistStats);
+        } else {
+          s = periodic_task_scheduler_.Register(
+              PeriodicTaskType::kPersistStats,
+              periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+              new_options.stats_persist_period_sec);
+        }
+      }
+      mutex_.Lock();
+      if (!s.ok()) {
+        return s;
+      }
+
+      write_controller_.set_max_delayed_write_rate(
+          new_options.delayed_write_rate);
+      table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+                                          ? TableCache::kInfiniteCapacity
+                                          : new_options.max_open_files - 10);
+      wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+                    new_options.wal_bytes_per_sync;
+      mutable_db_options_ = new_options;
+      file_options_for_compaction_ = FileOptions(new_db_options);
+      file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+          file_options_for_compaction_, immutable_db_options_);
+      versions_->ChangeFileOptions(mutable_db_options_);
+      // TODO(xiez): clarify why apply optimize for read to write options
+      file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+          file_options_for_compaction_, immutable_db_options_);
+      file_options_for_compaction_.compaction_readahead_size =
+          mutable_db_options_.compaction_readahead_size;
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+        Status purge_wal_status = SwitchWAL(&write_context);
+        if (!purge_wal_status.ok()) {
+          ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                         "Unable to purge WAL files in SetDBOptions() -- %s",
+                         purge_wal_status.ToString().c_str());
+        }
+      }
+      persist_options_status = WriteOptionsFile(
+          false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+      write_thread_.ExitUnbatched(&w);
+    } else {
+      // To get here, we must have had invalid options and will not attempt to
+      // persist the options, which means the status is "OK/Uninitialized".
+      persist_options_status.PermitUncheckedError();
+    }
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      if (immutable_db_options_.fail_if_options_file_error) {
+        s = Status::IOError(
+            "SetDBOptions() succeeded, but unable to persist options",
+            persist_options_status.ToString());
+      }
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to persist options in SetDBOptions() -- %s",
+                     persist_options_status.ToString().c_str());
+    }
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+  }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+    ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+    int level) {
+  mutex_.AssertHeld();
+  const auto* vstorage = cfd->current()->storage_info();
+  int minimum_level = level;
+  for (int i = level - 1; i > 0; --i) {
+    // stop if level i is not empty
+    if (vstorage->NumLevelFiles(i) > 0) break;
+    // stop if level i is too small (cannot fit the level files)
+    if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+      break;
+    }
+
+    minimum_level = i;
+  }
+  return minimum_level;
+}
+
+Status DBImpl::FlushWAL(bool sync) {
+  if (manual_wal_flush_) {
+    IOStatus io_s;
+    {
+      // We need to lock log_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&log_write_mutex_);
+      log::Writer* cur_log_writer = logs_.back().writer;
+      io_s = cur_log_writer->WriteBuffer();
+    }
+    if (!io_s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+                      io_s.ToString().c_str());
+      // In case there is a fs error we should set it globally to prevent the
+      // future writes
+      IOStatusCheck(io_s);
+      // whether sync or not, we should abort the rest of function upon error
+      return static_cast<Status>(io_s);
+    }
+    if (!sync) {
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+      return static_cast<Status>(io_s);
+    }
+  }
+  if (!sync) {
+    return Status::OK();
+  }
+  // sync = true
+  ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+  return SyncWAL();
+}
+
+bool DBImpl::WALBufferIsEmpty(bool lock) {
+  if (lock) {
+    log_write_mutex_.Lock();
+  }
+  log::Writer* cur_log_writer = logs_.back().writer;
+  auto res = cur_log_writer->BufferIsEmpty();
+  if (lock) {
+    log_write_mutex_.Unlock();
+  }
+  return res;
+}
+
+Status DBImpl::SyncWAL() {
+  TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
+  autovector<log::Writer*, 1> logs_to_sync;
+  bool need_log_dir_sync;
+  uint64_t current_log_number;
+
+  {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    assert(!logs_.empty());
+
+    // This SyncWAL() call only cares about logs up to this number.
+    current_log_number = logfile_number_;
+
+    while (logs_.front().number <= current_log_number &&
+           logs_.front().IsSyncing()) {
+      log_sync_cv_.Wait();
+    }
+    // First check that logs are safe to sync in background.
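+    // Hedged aside (not upstream text): the two passes below are deliberate.
+    // The first pass only validates that every WAL up to current_log_number
+    // reports IsSyncThreadSafe(); nothing is marked as syncing until the
+    // whole range passes, so a NotSupported WAL implementation leaves the
+    // log list untouched rather than half-prepared.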
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+        return Status::NotSupported(
+            "SyncWAL() is not supported for this implementation of WAL file",
+            immutable_db_options_.allow_mmap_writes
+                ? "try setting Options::allow_mmap_writes to false"
+                : Slice());
+      }
+    }
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      auto& log = *it;
+      log.PrepareForSync();
+      logs_to_sync.push_back(log.writer);
+    }
+
+    need_log_dir_sync = !log_dir_synced_;
+  }
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+  RecordTick(stats_, WAL_FILE_SYNCED);
+  Status status;
+  IOStatus io_s;
+  for (log::Writer* log : logs_to_sync) {
+    io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+    if (!io_s.ok()) {
+      status = io_s;
+      break;
+    }
+  }
+  if (!io_s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s",
+                    io_s.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent the
+    // future writes
+    IOStatusCheck(io_s);
+  }
+  if (status.ok() && need_log_dir_sync) {
+    status = directories_.GetWalDir()->FsyncWithDirOptions(
+        IOOptions(), nullptr,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+  }
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+  VersionEdit synced_wals;
+  {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    if (status.ok()) {
+      MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
+    } else {
+      MarkLogsNotSynced(current_log_number);
+    }
+  }
+  if (status.ok() && synced_wals.IsWalAddition()) {
+    InstrumentedMutexLock l(&mutex_);
+    status = ApplyWALToManifest(&synced_wals);
+  }
+
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+  return status;
+}
+
+Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
+  // not empty, write to MANIFEST.
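+  // Illustrative aside (not part of the upstream change): the usual route
+  // into this function. Assumes an open DB `db` with manual_wal_flush=true.
+  //
+  //   Status st = db->FlushWAL(/*sync=*/true);  // buffer -> OS -> fsync
+  //
+  // On success, and with track_and_verify_wals_in_manifest enabled,
+  // MarkLogsSynced() collects fully synced, inactive WALs into `synced_wals`,
+  // which this function then records in the MANIFEST.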
+  mutex_.AssertHeld();
+  Status status = versions_->LogAndApplyToDefaultColumnFamily(
+      synced_wals, &mutex_, directories_.GetDbDir());
+  if (!status.ok() && versions_->io_status().IsIOError()) {
+    status = error_handler_.SetBGError(versions_->io_status(),
+                                       BackgroundErrorReason::kManifestWrite);
+  }
+  return status;
+}
+
+Status DBImpl::LockWAL() {
+  log_write_mutex_.Lock();
+  auto cur_log_writer = logs_.back().writer;
+  IOStatus status = cur_log_writer->WriteBuffer();
+  if (!status.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+                    status.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent the
+    // future writes
+    WriteStatusCheck(status);
+  }
+  return static_cast<Status>(status);
+}
+
+Status DBImpl::UnlockWAL() {
+  log_write_mutex_.Unlock();
+  return Status::OK();
+}
+
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+                            VersionEdit* synced_wals) {
+  log_write_mutex_.AssertHeld();
+  if (synced_dir && logfile_number_ == up_to) {
+    log_dir_synced_ = true;
+  }
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+    auto& wal = *it;
+    assert(wal.IsSyncing());
+
+    if (wal.number < logs_.back().number) {
+      // Inactive WAL
+      if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+          wal.GetPreSyncSize() > 0) {
+        synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
+      }
+      if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) {
+        // Fully synced
+        logs_to_free_.push_back(wal.ReleaseWriter());
+        it = logs_.erase(it);
+      } else {
+        assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize());
+        wal.FinishSync();
+        ++it;
+      }
+    } else {
+      assert(wal.number == logs_.back().number);
+      // Active WAL
+      wal.FinishSync();
+      ++it;
+    }
+  }
+  log_sync_cv_.SignalAll();
+}
+
+void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
+  log_write_mutex_.AssertHeld();
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
+       ++it) {
+    auto& wal = *it;
+    wal.FinishSync();
+  }
+  log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+  versions_->SetLastPublishedSequence(seq);
+}
+
+Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                   std::string* ts_low) {
+  if (ts_low == nullptr) {
+    return Status::InvalidArgument("ts_low is nullptr");
+  }
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  InstrumentedMutexLock l(&mutex_);
+  *ts_low = cfd->GetFullHistoryTsLow();
+  assert(cfd->user_comparator()->timestamp_size() == ts_low->size());
+  return Status::OK();
+}
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+                                              Arena* arena,
+                                              SequenceNumber sequence,
+                                              ColumnFamilyHandle* column_family,
+                                              bool allow_unprepared_value) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+
+  mutex_.Lock();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  return NewInternalIterator(read_options, cfd, super_version, arena, sequence,
+                             allow_unprepared_value);
+}
+
+void DBImpl::SchedulePurge() {
+  mutex_.AssertHeld();
+  assert(opened_successfully_);
+
+  // Purge operations are put into High priority queue
+  bg_purge_scheduled_++;
+  env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+  mutex_.Lock();
+
+  while (!logs_to_free_queue_.empty()) {
+    assert(!logs_to_free_queue_.empty());
+    log::Writer* log_writer = *(logs_to_free_queue_.begin());
+    logs_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete log_writer;
+    mutex_.Lock();
+  }
+  while (!superversions_to_free_queue_.empty()) {
+    assert(!superversions_to_free_queue_.empty());
+    SuperVersion* sv = superversions_to_free_queue_.front();
+    superversions_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete sv;
+    mutex_.Lock();
+  }
+
+  assert(bg_purge_scheduled_ > 0);
+
+  // Can't use iterator to go over purge_files_ because inside the loop we're
+  // unlocking the mutex that protects purge_files_.
+  while (!purge_files_.empty()) {
+    auto it = purge_files_.begin();
+    // Need to make a copy of the PurgeFilesInfo before unlocking the mutex.
+    PurgeFileInfo purge_file = it->second;
+
+    const std::string& fname = purge_file.fname;
+    const std::string& dir_to_sync = purge_file.dir_to_sync;
+    FileType type = purge_file.type;
+    uint64_t number = purge_file.number;
+    int job_id = purge_file.job_id;
+
+    purge_files_.erase(it);
+
+    mutex_.Unlock();
+    DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+    mutex_.Lock();
+  }
+
+  bg_purge_scheduled_--;
+
+  bg_cv_.SignalAll();
+  // IMPORTANT: there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be deallocated and referencing them
+  // will cause trouble.
+  mutex_.Unlock();
+}
+
+namespace {
+
+// A `SuperVersionHandle` holds a non-null `SuperVersion*` pointing at a
+// `SuperVersion` referenced once for this object. It also contains the state
+// needed to clean up the `SuperVersion` reference from outside of `DBImpl`
+// using `CleanupSuperVersionHandle()`.
+struct SuperVersionHandle {
+  // `_super_version` must be non-nullptr and `Ref()`'d once as long as the
+  // `SuperVersionHandle` may use it.
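+  //
+  // Hedged aside (not upstream text): the intended lifecycle, in sketch form.
+  // `sv` stands in for any SuperVersion owned by a column family.
+  //
+  //   SuperVersion* sv = cfd->GetSuperVersion()->Ref();   // +1 for handle
+  //   auto* handle = new SuperVersionHandle(db, mu, sv, background_purge);
+  //   iter->RegisterCleanup(CleanupSuperVersionHandle, handle, nullptr);
+  //   // When the iterator dies, CleanupSuperVersionHandle() drops the ref
+  //   // and, if it was the last one, cleans up under the DB mutex.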
+  SuperVersionHandle(DBImpl* _db, InstrumentedMutex* _mu,
+                     SuperVersion* _super_version, bool _background_purge)
+      : db(_db),
+        mu(_mu),
+        super_version(_super_version),
+        background_purge(_background_purge) {}
+
+  DBImpl* db;
+  InstrumentedMutex* mu;
+  SuperVersion* super_version;
+  bool background_purge;
+};
+
+static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) {
+  SuperVersionHandle* sv_handle = reinterpret_cast<SuperVersionHandle*>(arg1);
+
+  if (sv_handle->super_version->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
+
+    sv_handle->mu->Lock();
+    sv_handle->super_version->Cleanup();
+    sv_handle->db->FindObsoleteFiles(&job_context, false, true);
+    if (sv_handle->background_purge) {
+      sv_handle->db->ScheduleBgLogWriterClose(&job_context);
+      sv_handle->db->AddSuperVersionsToFreeQueue(sv_handle->super_version);
+      sv_handle->db->SchedulePurge();
+    }
+    sv_handle->mu->Unlock();
+
+    if (!sv_handle->background_purge) {
+      delete sv_handle->super_version;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      sv_handle->db->PurgeObsoleteFiles(job_context,
+                                        sv_handle->background_purge);
+    }
+    job_context.Clean();
+  }
+
+  delete sv_handle;
+}
+
+struct GetMergeOperandsState {
+  MergeContext merge_context;
+  PinnedIteratorsManager pinned_iters_mgr;
+  SuperVersionHandle* sv_handle;
+};
+
+static void CleanupGetMergeOperandsState(void* arg1, void* /*arg2*/) {
+  GetMergeOperandsState* state = static_cast<GetMergeOperandsState*>(arg1);
+  CleanupSuperVersionHandle(state->sv_handle /* arg1 */, nullptr /* arg2 */);
+  delete state;
+}
+
+}  // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SuperVersion* super_version, Arena* arena, SequenceNumber sequence,
+    bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
+  InternalIterator* internal_iter;
+  assert(arena != nullptr);
+  // Need to create internal iterator from the arena.
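+  // Hedged aside (not upstream text): schematically, the builder below merges
+  //
+  //   mutable memtable  ->  immutable memtables  ->  L0..Ln SST files
+  //
+  // (point iterators plus, unless range deletions are ignored, their range
+  // tombstone iterators) into a single InternalIterator ordered by internal
+  // key.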
+  MergeIteratorBuilder merge_iter_builder(
+      &cfd->internal_comparator(), arena,
+      !read_options.total_order_seek &&
+          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      read_options.iterate_upper_bound);
+  // Collect iterator for mutable memtable
+  auto mem_iter = super_version->mem->NewIterator(read_options, arena);
+  Status s;
+  if (!read_options.ignore_range_deletions) {
+    TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+    auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+        read_options, sequence, false /* immutable_memtable */);
+    if (range_del_iter == nullptr || range_del_iter->empty()) {
+      delete range_del_iter;
+    } else {
+      mem_tombstone_iter = new TruncatedRangeDelIterator(
+          std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+          &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
+          nullptr /* largest */);
+    }
+    merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
+                                                    mem_tombstone_iter);
+  } else {
+    merge_iter_builder.AddIterator(mem_iter);
+  }
+
+  // Collect all needed child iterators for immutable memtables
+  if (s.ok()) {
+    super_version->imm->AddIterators(read_options, &merge_iter_builder,
+                                     !read_options.ignore_range_deletions);
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+  if (s.ok()) {
+    // Collect iterators for files in L0 - Ln
+    if (read_options.read_tier != kMemtableTier) {
+      super_version->current->AddIterators(read_options, file_options_,
+                                           &merge_iter_builder,
+                                           allow_unprepared_value);
+    }
+    internal_iter = merge_iter_builder.Finish(
+        read_options.ignore_range_deletions ? nullptr : db_iter);
+    SuperVersionHandle* cleanup = new SuperVersionHandle(
+        this, &mutex_, super_version,
+        read_options.background_purge_on_iterator_cleanup ||
+            immutable_db_options_.avoid_unnecessary_blocking_io);
+    internal_iter->RegisterCleanup(CleanupSuperVersionHandle, cleanup, nullptr);
+
+    return internal_iter;
+  } else {
+    CleanupSuperVersion(super_version);
+  }
+  return NewErrorInternalIterator(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+  return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value) {
+  return Get(read_options, column_family, key, value, /*timestamp=*/nullptr);
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value, std::string* timestamp) {
+  assert(value != nullptr);
+  value->Reset();
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = value;
+  get_impl_options.timestamp = timestamp;
+  Status s = GetImpl(read_options, key, get_impl_options);
+  return s;
+}
+
+Status DBImpl::GetEntity(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         PinnableWideColumns* columns) {
+  if (!column_family) {
+    return Status::InvalidArgument(
+        "Cannot call GetEntity without a column family handle");
+  }
+
+  if (!columns) {
+    return Status::InvalidArgument(
+        "Cannot call GetEntity without a PinnableWideColumns object");
+  }
+
+  columns->Reset();
+
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.columns = columns;
+
+  return GetImpl(read_options, key, get_impl_options);
+}
+
+bool DBImpl::ShouldReferenceSuperVersion(
+    const MergeContext& merge_context) {
+  // If both thresholds are reached, a function returning merge operands as
+  // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or
+  // numerous `memcpy()`s.
+  //
+  // The below constants enable the optimization conservatively. They are
+  // verified to not regress `GetMergeOperands()` latency in the following
+  // scenarios.
+  //
+  // - CPU: two socket Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz
+  // - `GetMergeOperands()` threads: 1 - 32
+  // - Entry size: 32 bytes - 4KB
+  // - Merges per key: 1 - 16K
+  // - LSM component: memtable
+  //
+  // TODO(ajkr): expand measurement to SST files.
+  static const size_t kNumBytesForSvRef = 32768;
+  static const size_t kLog2AvgBytesForSvRef = 8;  // 256 bytes
+
+  size_t num_bytes = 0;
+  for (const Slice& sl : merge_context.GetOperands()) {
+    num_bytes += sl.size();
+  }
+  return num_bytes >= kNumBytesForSvRef &&
+         (num_bytes >> kLog2AvgBytesForSvRef) >=
+             merge_context.GetOperands().size();
+}
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+                       GetImplOptions& get_impl_options) {
+  assert(get_impl_options.value != nullptr ||
+         get_impl_options.merge_operands != nullptr ||
+         get_impl_options.columns != nullptr);
+
+  assert(get_impl_options.column_family);
+
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(get_impl_options.column_family,
+                                        *(read_options.timestamp),
+                                        /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(get_impl_options.column_family);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (get_impl_options.timestamp) {
+    get_impl_options.timestamp->clear();
+  }
+
+  GetWithTimestampReadCallback read_cb(0);  // Will call Refresh
+
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+      get_impl_options.column_family);
+  auto cfd = cfh->cfd();
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError();
+    }
+  }
+
+  if (get_impl_options.get_merge_operands_options != nullptr) {
+    for (int i = 0; i < get_impl_options.get_merge_operands_options
+                            ->expected_max_number_of_operands;
+         ++i) {
+      get_impl_options.merge_operands[i].Reset();
+    }
+  }
+
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  TEST_SYNC_POINT("DBImpl::GetImpl:1");
+  TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+  SequenceNumber snapshot;
+  if (read_options.snapshot != nullptr) {
+    if (get_impl_options.callback) {
+      // Already calculated based on read_options.snapshot
+      snapshot = get_impl_options.callback->max_visible_seq();
+    } else {
+      snapshot =
+          reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+              ->number_;
+    }
+  } else {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+    // visible to the snapshot before compaction nor the newer data inserted
+    // afterwards.
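+    //
+    // Hedged aside (not upstream text): the ordering above in sketch form.
+    //
+    //   SuperVersion* sv = GetAndRefSuperVersion(cfd);     // pin data first
+    //   SequenceNumber snap = GetLastPublishedSequence();  // then pick seq
+    //
+    // Reversing these two steps would allow a flush/compaction between them
+    // to drop versions the chosen sequence number still needs.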
+    snapshot = GetLastPublishedSequence();
+    if (get_impl_options.callback) {
+      // The unprep_seqs are not published for write unprepared, so it could be
+      // that max_visible_seq is larger. Seek to the std::max of the two.
+      // However, we still want our callback to contain the actual snapshot so
+      // that it can do the correct visibility filtering.
+      get_impl_options.callback->Refresh(snapshot);
+
+      // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+      // max_visible_seq = max(max_visible_seq, snapshot)
+      //
+      // Currently, the commented out assert is broken by
+      // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+      // the regular transaction flow, then this special read callback would
+      // not be needed.
+      //
+      // assert(callback->max_visible_seq() >= snapshot);
+      snapshot = get_impl_options.callback->max_visible_seq();
+    }
+  }
+  // If timestamp is used, we use read callback to ensure <key,t,s> is returned
+  // only if t <= read_opts.timestamp and s <= snapshot.
+  // HACK: temporarily overwrite input struct field but restore
+  SaveAndRestore<ReadCallback*> restore_callback(&get_impl_options.callback);
+  const Comparator* ucmp = get_impl_options.column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    assert(!get_impl_options
+                .callback);  // timestamp with callback is not supported
+    read_cb.Refresh(snapshot);
+    get_impl_options.callback = &read_cb;
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:3");
+  TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+
+  Status s;
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+                        has_unpersisted_data_.load(std::memory_order_relaxed));
+  bool done = false;
+  std::string* timestamp =
+      ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
+  if (!skip_memtable) {
+    // Get value associated with key
+    if (get_impl_options.get_value) {
+      if (sv->mem->Get(
+              lkey,
+              get_impl_options.value ? get_impl_options.value->GetSelf()
+                                     : nullptr,
+              get_impl_options.columns, timestamp, &s, &merge_context,
+              &max_covering_tombstone_seq, read_options,
+              false /* immutable_memtable */, get_impl_options.callback,
+              get_impl_options.is_blob_index)) {
+        done = true;
+
+        if (get_impl_options.value) {
+          get_impl_options.value->PinSelf();
+        }
+
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if ((s.ok() || s.IsMergeInProgress()) &&
+                 sv->imm->Get(lkey,
+                              get_impl_options.value
+                                  ? get_impl_options.value->GetSelf()
+                                  : nullptr,
+                              get_impl_options.columns, timestamp, &s,
+                              &merge_context, &max_covering_tombstone_seq,
+                              read_options, get_impl_options.callback,
+                              get_impl_options.is_blob_index)) {
+        done = true;
+
+        if (get_impl_options.value) {
+          get_impl_options.value->PinSelf();
+        }
+
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    } else {
+      // Get Merge Operands associated with key, Merge Operands should not be
+      // merged and raw values should be returned to the user.
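+      //
+      // Illustrative aside (not part of the upstream change): a caller-side
+      // sketch of this path. Assumes an open DB `db` with a merge operator
+      // configured; sizes and names are examples only.
+      //
+      //   GetMergeOperandsOptions opts;
+      //   opts.expected_max_number_of_operands = 8;
+      //   std::vector<PinnableSlice> operands(8);
+      //   int num_operands = 0;
+      //   Status st = db->GetMergeOperands(
+      //       ReadOptions(), db->DefaultColumnFamily(), "key",
+      //       operands.data(), &opts, &num_operands);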
+      if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                       /*timestamp=*/nullptr, &s, &merge_context,
+                       &max_covering_tombstone_seq, read_options,
+                       false /* immutable_memtable */, nullptr, nullptr,
+                       false)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if ((s.ok() || s.IsMergeInProgress()) &&
+                 sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+                                           &max_covering_tombstone_seq,
+                                           read_options)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    }
+    if (!done && !s.ok() && !s.IsMergeInProgress()) {
+      ReturnAndCleanupSuperVersion(cfd, sv);
+      return s;
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:0");
+  TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:1");
+  PinnedIteratorsManager pinned_iters_mgr;
+  if (!done) {
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    sv->current->Get(
+        read_options, lkey, get_impl_options.value, get_impl_options.columns,
+        timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+        &pinned_iters_mgr,
+        get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+        nullptr, nullptr,
+        get_impl_options.get_value ? get_impl_options.callback : nullptr,
+        get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+        get_impl_options.get_value);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+
+  {
+    PERF_TIMER_GUARD(get_post_process_time);
+
+    RecordTick(stats_, NUMBER_KEYS_READ);
+    size_t size = 0;
+    if (s.ok()) {
+      if (get_impl_options.get_value) {
+        if (get_impl_options.value) {
+          size = get_impl_options.value->size();
+        } else if (get_impl_options.columns) {
+          size = get_impl_options.columns->serialized_size();
+        }
+      } else {
+        // Return all merge operands for get_impl_options.key
+        *get_impl_options.number_of_operands =
+            static_cast<int>(merge_context.GetNumOperands());
+        if (*get_impl_options.number_of_operands >
+            get_impl_options.get_merge_operands_options
+                ->expected_max_number_of_operands) {
+          s = Status::Incomplete(
+              Status::SubCode::KMergeOperandsInsufficientCapacity);
+        } else {
+          // Each operand depends on one of the following resources: `sv`,
+          // `pinned_iters_mgr`, or `merge_context`. It would be crazy expensive
+          // to reference `sv` for each operand relying on it because `sv` is
+          // (un)ref'd in all threads using the DB. Furthermore, we do not track
+          // on which resource each operand depends.
+          //
+          // To solve this, we bundle the resources in a `GetMergeOperandsState`
+          // and manage them with a `SharedCleanablePtr` shared among the
+          // `PinnableSlice`s we return. This bundle includes one `sv` reference
+          // and ownership of the `merge_context` and `pinned_iters_mgr`
+          // objects.
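+          //
+          // Hedged aside (not upstream text): a worked example of the
+          // ShouldReferenceSuperVersion() thresholds. With 200 operands of
+          // 512 bytes each, num_bytes = 102400 >= 32768 and
+          // 102400 >> 8 = 400 >= 200, so the SuperVersion is referenced.
+          // With 200 operands of 100 bytes, num_bytes = 20000 < 32768, so
+          // the operands are copied into the PinnableSlices instead.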
+          bool ref_sv = ShouldReferenceSuperVersion(merge_context);
+          if (ref_sv) {
+            assert(!merge_context.GetOperands().empty());
+            SharedCleanablePtr shared_cleanable;
+            GetMergeOperandsState* state = nullptr;
+            state = new GetMergeOperandsState();
+            state->merge_context = std::move(merge_context);
+            state->pinned_iters_mgr = std::move(pinned_iters_mgr);
+
+            sv->Ref();
+
+            state->sv_handle = new SuperVersionHandle(
+                this, &mutex_, sv,
+                immutable_db_options_.avoid_unnecessary_blocking_io);
+
+            shared_cleanable.Allocate();
+            shared_cleanable->RegisterCleanup(CleanupGetMergeOperandsState,
+                                              state /* arg1 */,
+                                              nullptr /* arg2 */);
+            for (size_t i = 0; i < state->merge_context.GetOperands().size();
+                 ++i) {
+              const Slice& sl = state->merge_context.GetOperands()[i];
+              size += sl.size();
+
+              get_impl_options.merge_operands->PinSlice(
+                  sl, nullptr /* cleanable */);
+              if (i == state->merge_context.GetOperands().size() - 1) {
+                shared_cleanable.MoveAsCleanupTo(
+                    get_impl_options.merge_operands);
+              } else {
+                shared_cleanable.RegisterCopyWith(
+                    get_impl_options.merge_operands);
+              }
+              get_impl_options.merge_operands++;
+            }
+          } else {
+            for (const Slice& sl : merge_context.GetOperands()) {
+              size += sl.size();
+              get_impl_options.merge_operands->PinSelf(sl);
+              get_impl_options.merge_operands++;
+            }
+          }
+        }
+      }
+      RecordTick(stats_, BYTES_READ, size);
+      PERF_COUNTER_ADD(get_read_bytes, size);
+    }
+
+    ReturnAndCleanupSuperVersion(cfd, sv);
+
+    RecordInHistogram(stats_, BYTES_PER_READ, size);
+  }
+  return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return MultiGet(read_options, column_family, keys, values,
+                  /*timestamps=*/nullptr);
+}
+
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values,
+    std::vector<std::string>* timestamps) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  size_t num_keys = keys.size();
+  assert(column_family.size() == num_keys);
+  std::vector<Status> stat_list(num_keys);
+
+  bool should_fail = false;
+  for (size_t i = 0; i < num_keys; ++i) {
+    assert(column_family[i]);
+    if (read_options.timestamp) {
+      stat_list[i] = FailIfTsMismatchCf(
+          column_family[i], *(read_options.timestamp), /*ts_for_read=*/true);
+      if (!stat_list[i].ok()) {
+        should_fail = true;
+      }
+    } else {
+      stat_list[i] = FailIfCfHasTs(column_family[i]);
+      if (!stat_list[i].ok()) {
+        should_fail = true;
+      }
+    }
+  }
+
+  if (should_fail) {
+    for (auto& s : stat_list) {
+      if (s.ok()) {
+        s = Status::Incomplete(
+            "DB not queried due to invalid argument(s) in the same MultiGet");
+      }
+    }
+    return stat_list;
+  }
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(column_family, keys).PermitUncheckedError();
+    }
+  }
+
+  SequenceNumber consistent_seqnum;
+
+  UnorderedMap<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+      column_family.size());
+  for (auto cf : column_family) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(cf);
+    auto cfd = cfh->cfd();
+    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+      multiget_cf_data.emplace(cfd->GetID(),
+                               MultiGetColumnFamilyData(cfh, nullptr));
+    }
+  }
+
+  std::function<MultiGetColumnFamilyData*(
+      UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+      iter_deref_lambda =
+          [](UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&
+                 cf_iter) { return &cf_iter->second; };
+
+  bool unref_only =
+      MultiCFSnapshot<UnorderedMap<uint32_t, MultiGetColumnFamilyData>>(
+          read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+          &consistent_seqnum);
+
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1");
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2");
+
+  // Contain a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytes_read = 0;
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t num_found = 0;
+  size_t keys_read;
+  uint64_t curr_value_size = 0;
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = nullptr;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  for (keys_read = 0; keys_read < num_keys; ++keys_read) {
+    merge_context.Clear();
+    Status& s = stat_list[keys_read];
+    std::string* value = &(*values)[keys_read];
+    std::string* timestamp =
+        timestamps ? &(*timestamps)[keys_read] : nullptr;
+
+    LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp);
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+        column_family[keys_read]);
+    SequenceNumber max_covering_tombstone_seq = 0;
+    auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+    assert(mgd_iter != multiget_cf_data.end());
+    auto mgd = mgd_iter->second;
+    auto super_version = mgd.super_version;
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    bool done = false;
+    if (!skip_memtable) {
+      if (super_version->mem->Get(
+              lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context,
+              &max_covering_tombstone_seq, read_options,
+              false /* immutable_memtable */, read_callback)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr,
+                                         timestamp, &s, &merge_context,
+                                         &max_covering_tombstone_seq,
+                                         read_options, read_callback)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    }
+    if (!done) {
+      PinnableSlice pinnable_val;
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      PinnedIteratorsManager pinned_iters_mgr;
+      super_version->current->Get(read_options, lkey, &pinnable_val,
+                                  /*columns=*/nullptr, timestamp, &s,
+                                  &merge_context, &max_covering_tombstone_seq,
+                                  &pinned_iters_mgr, /*value_found=*/nullptr,
+                                  /*key_exists=*/nullptr,
+                                  /*seq=*/nullptr, read_callback);
+      value->assign(pinnable_val.data(), pinnable_val.size());
+      RecordTick(stats_, MEMTABLE_MISS);
+    }
+
+    if (s.ok()) {
+      bytes_read += value->size();
+      num_found++;
+      curr_value_size += value->size();
+      if (curr_value_size > read_options.value_size_soft_limit) {
+        while (++keys_read < num_keys) {
+          stat_list[keys_read] = Status::Aborted();
+        }
+        break;
+      }
+    }
+    if (read_options.deadline.count() &&
+        immutable_db_options_.clock->NowMicros() >
+            static_cast<uint64_t>(read_options.deadline.count())) {
+      break;
+    }
+  }
+
+  if (keys_read < num_keys) {
+    // The only reason to break out of the loop is when the deadline is
+    // exceeded
+    assert(immutable_db_options_.clock->NowMicros() >
+           static_cast<uint64_t>(read_options.deadline.count()));
+    for (++keys_read; keys_read < num_keys; ++keys_read) {
+      stat_list[keys_read] = Status::TimedOut();
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  autovector<SuperVersion*> superversions_to_delete;
+
+  for (auto mgd_iter : multiget_cf_data) {
+    auto mgd = mgd_iter.second;
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+    } else {
+      mgd.cfd->GetSuperVersion()->Unref();
+    }
+  }
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return stat_list;
+}
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+    const ReadOptions& read_options, ReadCallback* callback,
+    std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+        iter_deref_func,
+    T* cf_list, SequenceNumber* snapshot) {
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  bool last_try = false;
+  if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the thread
+    // local super version.
+    auto cf_iter = cf_list->begin();
+    auto node = iter_deref_func(cf_iter);
+    node->super_version = GetAndRefSuperVersion(node->cfd);
+    if (read_options.snapshot != nullptr) {
+      // Note: In WritePrepared txns this is not necessary but not harmful
+      // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+      // a snapshot is specified we should be fine with skipping seq numbers
+      // that are greater than that.
+      //
+      // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+      // may skip uncommitted data that should be visible to the transaction for
+      // reading own writes.
+      *snapshot =
+          static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+      if (callback) {
+        *snapshot = std::max(*snapshot, callback->max_visible_seq());
+      }
+    } else {
+      // Since we get and reference the super version before getting
+      // the snapshot number, without a mutex protection, it is possible
+      // that a memtable switch happened in the middle and not all the
+      // data for this snapshot is available. But it will contain all
+      // the data available in the super version we have, which is also
+      // a valid snapshot to read from.
+      // We shouldn't get snapshot before finding and referencing the super
+      // version because a flush happening in between may compact away data for
+      // the snapshot, but the snapshot is earlier than the data overwriting it,
+      // so users may see wrong results.
+      *snapshot = GetLastPublishedSequence();
+    }
+  } else {
+    // If we end up with the same issue of the memtable getting sealed during
+    // 2 consecutive retries, it means the write rate is very high. In that
+    // case it's probably ok to take the mutex on the 3rd try so we can
+    // succeed for sure.
+    constexpr int num_retries = 3;
+    for (int i = 0; i < num_retries; ++i) {
+      last_try = (i == num_retries - 1);
+      bool retry = false;
+
+      if (i > 0) {
+        for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+             ++cf_iter) {
+          auto node = iter_deref_func(cf_iter);
+          SuperVersion* super_version = node->super_version;
+          ColumnFamilyData* cfd = node->cfd;
+          if (super_version != nullptr) {
+            ReturnAndCleanupSuperVersion(cfd, super_version);
+          }
+          node->super_version = nullptr;
+        }
+      }
+      if (read_options.snapshot == nullptr) {
+        if (last_try) {
+          TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+          // We're close to max number of retries. For the last retry,
+          // acquire the lock so we're sure to succeed
+          mutex_.Lock();
+        }
+        *snapshot = GetLastPublishedSequence();
+      } else {
+        *snapshot =
+            static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
+                ->number_;
+      }
+      for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+           ++cf_iter) {
+        auto node = iter_deref_func(cf_iter);
+        if (!last_try) {
+          node->super_version = GetAndRefSuperVersion(node->cfd);
+        } else {
+          node->super_version = node->cfd->GetSuperVersion()->Ref();
+        }
+        TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+        if (read_options.snapshot != nullptr || last_try) {
+          // If user passed a snapshot, then we don't care if a memtable is
+          // sealed or compaction happens because the snapshot would ensure
+          // that older key versions are kept around. If this is the last
+          // retry, then we have the lock so nothing bad can happen
+          continue;
+        }
+        // We could get the earliest sequence number for the whole list of
+        // memtables, which will include immutable memtables as well, but that
+        // might be tricky to maintain in case we decide, in future, to do
+        // memtable compaction.
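+        //
+        // Hedged aside (not upstream text): the consistency check below in
+        // sketch form. If the memtable's earliest sequence number exceeds the
+        // snapshot we picked, a memtable switch happened after the snapshot
+        // was chosen, e.g.:
+        //
+        //   *snapshot = 100;                         // chosen first
+        //   // ... memtable switch; new memtable starts at seq 105 ...
+        //   mem->GetEarliestSequenceNumber() == 105  // 105 > 100 => retry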
+        if (!last_try) {
+          SequenceNumber seq =
+              node->super_version->mem->GetEarliestSequenceNumber();
+          if (seq > *snapshot) {
+            retry = true;
+            break;
+          }
+        }
+      }
+      if (!retry) {
+        if (last_try) {
+          mutex_.Unlock();
+        }
+        break;
+      }
+    }
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableSlice* values, Status* statuses,
+                      const bool sorted_input) {
+  return MultiGet(read_options, num_keys, column_families, keys, values,
+                  /*timestamps=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableSlice* values, std::string* timestamps,
+                      Status* statuses, const bool sorted_input) {
+  if (num_keys == 0) {
+    return;
+  }
+
+  bool should_fail = false;
+  for (size_t i = 0; i < num_keys; ++i) {
+    ColumnFamilyHandle* cfh = column_families[i];
+    assert(cfh);
+    if (read_options.timestamp) {
+      statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp),
+                                       /*ts_for_read=*/true);
+      if (!statuses[i].ok()) {
+        should_fail = true;
+      }
+    } else {
+      statuses[i] = FailIfCfHasTs(cfh);
+      if (!statuses[i].ok()) {
+        should_fail = true;
+      }
+    }
+  }
+  if (should_fail) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      if (statuses[i].ok()) {
+        statuses[i] = Status::Incomplete(
+            "DB not queried due to invalid argument(s) in the same MultiGet");
+      }
+    }
+    return;
+  }
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError();
+    }
+  }
+
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    values[i].Reset();
+    key_context.emplace_back(column_families[i], keys[i], &values[i],
+                             timestamps ? &timestamps[i] : nullptr,
+                             &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+  autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+      multiget_cf_data;
+  size_t cf_start = 0;
+  ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    KeyContext* key_ctx = sorted_keys[i];
+    if (key_ctx->column_family != cf) {
+      multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr);
+      cf_start = i;
+      cf = key_ctx->column_family;
+    }
+  }
+
+  multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+
+  std::function<MultiGetColumnFamilyData*(
+      autovector<MultiGetColumnFamilyData,
+                 MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+      iter_deref_lambda =
+          [](autovector<MultiGetColumnFamilyData,
+                        MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<
+      autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+      read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = nullptr;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  Status s;
+  auto cf_iter = multiget_cf_data.begin();
+  for (; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+    s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys,
+                     &sorted_keys, cf_iter->super_version, consistent_seqnum,
+                     read_callback);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (!s.ok()) {
+    assert(s.IsTimedOut() || s.IsAborted());
+    for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+      for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys;
+           ++i) {
+        *sorted_keys[i]->s = s;
+      }
+    }
+  }
+
+  for (const auto& iter : multiget_cf_data) {
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version);
+    } else {
+      iter.cfd->GetSuperVersion()->Unref();
+    }
+  }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+  inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+    ColumnFamilyHandleImpl* cfh =
+        static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+    uint32_t cfd_id1 = cfh->cfd()->GetID();
+    const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+    uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+    if (cfd_id1 < cfd_id2) {
+      return true;
+    } else if (cfd_id1 > cfd_id2) {
+      return false;
+    }
+
+    // Both keys are from the same column family
+    int cmp = comparator->CompareWithoutTimestamp(
+        *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false);
+    if (cmp < 0) {
+      return true;
+    }
+    return false;
+  }
+};
+
+}  // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+    size_t num_keys, bool sorted_input,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+  if (sorted_input) {
+#ifndef NDEBUG
+    assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(),
+                          CompareKeyContext()));
+#endif
+    return;
+  }
+
+  std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+            CompareKeyContext());
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const size_t num_keys,
+                      const Slice* keys, PinnableSlice* values,
+                      Status* statuses, const bool sorted_input) {
+  return MultiGet(read_options, column_family, num_keys, keys, values,
+                  /*timestamp=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const size_t num_keys,
+                      const Slice* keys, PinnableSlice* values,
+                      std::string* timestamps, Status* statuses,
+                      const bool sorted_input) {
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError();
+    }
+  }
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    values[i].Reset();
+    key_context.emplace_back(column_family, keys[i], &values[i],
+                             timestamps ? &timestamps[i] : nullptr,
+                             &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+  MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+    const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    ReadCallback* callback,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+  std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+  multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+  std::function<MultiGetColumnFamilyData*(
+      std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+      iter_deref_lambda =
+          [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  size_t num_keys = sorted_keys->size();
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+      read_options, callback, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+#ifndef NDEBUG
+  assert(!unref_only);
+#else
+  // Silence unused variable warning
+  (void)unref_only;
+#endif  // NDEBUG
+
+  if (callback && read_options.snapshot == nullptr) {
+    // The unprep_seqs are not published for write unprepared, so it could be
+    // that max_visible_seq is larger. Seek to the std::max of the two.
+    // However, we still want our callback to contain the actual snapshot so
+    // that it can do the correct visibility filtering.
+    callback->Refresh(consistent_seqnum);
+
+    // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+    // max_visible_seq = max(max_visible_seq, snapshot)
+    //
+    // Currently, the commented out assert is broken by
+    // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+    // the regular transaction flow, then this special read callback would not
+    // be needed.
+    //
+    // assert(callback->max_visible_seq() >= snapshot);
+    consistent_seqnum = callback->max_visible_seq();
+  }
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = callback;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    assert(!read_callback);  // timestamp with callback is not supported
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+                          multiget_cf_data[0].super_version, consistent_seqnum,
+                          read_callback);
+  assert(s.ok() || s.IsTimedOut() || s.IsAborted());
+  ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+                               multiget_cf_data[0].super_version);
+}
+
+// The actual implementation of batched MultiGet. Parameters -
+// start_key - Index in the sorted_keys vector to start processing from
+// num_keys - Number of keys to lookup, starting with sorted_keys[start_key]
+// sorted_keys - The entire batch of sorted keys for this CF
+//
+// The per key status is returned in the KeyContext structures pointed to by
+// sorted_keys. An overall Status is also returned, with the only possible
+// values being Status::OK() and Status::TimedOut(). The latter indicates
+// that the call exceeded read_options.deadline
+Status DBImpl::MultiGetImpl(
+    const ReadOptions& read_options, size_t start_key, size_t num_keys,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+    SuperVersion* super_version, SequenceNumber snapshot,
+    ReadCallback* callback) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+
+  assert(sorted_keys);
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  for (auto* kctx : *sorted_keys) {
+    assert(kctx);
+    if (kctx->timestamp) {
+      kctx->timestamp->clear();
+    }
+  }
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t keys_left = num_keys;
+  Status s;
+  uint64_t curr_value_size = 0;
+  while (keys_left) {
+    if (read_options.deadline.count() &&
+        immutable_db_options_.clock->NowMicros() >
+            static_cast<uint64_t>(read_options.deadline.count())) {
+      s = Status::TimedOut();
+      break;
+    }
+
+    size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+                            ? MultiGetContext::MAX_BATCH_SIZE
+                            : keys_left;
+    MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+                        batch_size, snapshot, read_options, GetFileSystem(),
+                        stats_);
+    MultiGetRange range = ctx.GetMultiGetRange();
+    range.AddValueSize(curr_value_size);
+    bool lookup_current = false;
+
+    keys_left -= batch_size;
+    for (auto mget_iter = range.begin(); mget_iter != range.end();
+         ++mget_iter) {
+      mget_iter->merge_context.Clear();
+      *mget_iter->s = Status::OK();
+    }
+
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    if (!skip_memtable) {
+      super_version->mem->MultiGet(read_options, &range, callback,
+                                   false /* immutable_memtable */);
+      if (!range.empty()) {
+        super_version->imm->MultiGet(read_options, &range, callback);
+      }
+      if (!range.empty()) {
+        lookup_current = true;
+        uint64_t left = range.KeysLeft();
+        RecordTick(stats_, MEMTABLE_MISS, left);
+      }
+    }
+    if (lookup_current) {
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      super_version->current->MultiGet(read_options, &range, callback);
+    }
+    curr_value_size = range.GetValueSize();
+    if (curr_value_size > read_options.value_size_soft_limit) {
+      s = Status::Aborted();
+      break;
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  size_t num_found = 0;
+  uint64_t bytes_read = 0;
+  for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) {
+    KeyContext* key = (*sorted_keys)[i];
+    if (key->s->ok()) {
+      bytes_read += key->value->size();
+      num_found++;
+    }
+  }
+  if (keys_left) {
+    assert(s.IsTimedOut() || s.IsAborted());
+    for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys;
+         ++i) {
+      KeyContext* key = (*sorted_keys)[i];
+      *key->s = s;
+    }
+  }
+
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return s;
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                  const std::string& column_family,
+                                  ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+  if (s.ok()) {
+    s = WriteOptionsFile(true /*need_mutex_lock*/,
+                         true /*need_enter_write_thread*/);
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+    const ColumnFamilyOptions& cf_options,
+    const std::vector<std::string>& column_family_names,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_family_names.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_families.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(column_families[i].options,
+                               column_families[i].name, &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                      const std::string& column_family_name,
+                                      ColumnFamilyHandle** handle) {
+  Status s;
+  *handle = nullptr;
+
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+  if (s.ok()) {
+    for (auto& cf_path : cf_options.cf_paths) {
+      s = env_->CreateDirIfMissing(cf_path.path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+        nullptr) {
+      return Status::InvalidArgument("Column family already exists");
+    }
+    VersionEdit edit;
+    edit.AddColumnFamily(column_family_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    edit.SetColumnFamily(new_id);
+    edit.SetLogNumber(logfile_number_);
+    edit.SetComparatorName(cf_options.comparator->Name());
+
+    // LogAndApply will both write the creation in MANIFEST and create
+    // ColumnFamilyData object
+    {  // write thread
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      // LogAndApply will both write the creation in MANIFEST and create
+      // ColumnFamilyData object
+      s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+                                 &mutex_, directories_.GetDbDir(), false,
+                                 &cf_options);
+      write_thread_.ExitUnbatched(&w);
+    }
+    if (s.ok()) {
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      std::map<std::string, std::shared_ptr<FSDirectory>> dummy_created_dirs;
+      s = cfd->AddDirectories(&dummy_created_dirs);
+    }
cfd->AddDirectories(&dummy_created_dirs); + } + if (s.ok()) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, + *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + + cfd->set_initialized(); + + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } + } // InstrumentedMutexLock l(&mutex_) + + if (cf_options.preserve_internal_time_seconds > 0 || + cf_options.preclude_last_level_data_seconds > 0) { + s = RegisterRecordSeqnoTimeWorker(); + } + sv_context.Clean(); + // this is outside the mutex + if (s.ok()) { + NewThreadStatusCfInfo( + static_cast_with_check(*handle)->cfd()); + } + return s; +} + +Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + assert(column_family != nullptr); + Status s = DropColumnFamilyImpl(column_family); + if (s.ok()) { + s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + } + return s; +} + +Status DBImpl::DropColumnFamilies( + const std::vector& column_families) { + Status s; + bool success_once = false; + for (auto* handle : column_families) { + s = DropColumnFamilyImpl(handle); + if (!s.ok()) { + break; + } + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + if (cfd->GetID() == 0) { + return Status::InvalidArgument("Can't drop default column family"); + } + + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); + + VersionEdit edit; + edit.DropColumnFamily(); + edit.SetColumnFamily(cfd->GetID()); + + Status s; + { + InstrumentedMutexLock l(&mutex_); + if (cfd->IsDropped()) { + s = Status::InvalidArgument("Column family already dropped!\n"); + } + if (s.ok()) { + // we drop column family from a single write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_, directories_.GetDbDir()); + write_thread_.ExitUnbatched(&w); + } + if (s.ok()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + + if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. 
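+      // For example (hypothetical setup): with column families {default,
+      // cf_a, cf_b} where only cf_a was opened with
+      // inplace_update_support=true (and therefore has memtables that cannot
+      // support snapshots), dropping cf_a makes the scan below visit only
+      // default and cf_b and flips is_snapshot_supported_ back to true.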
+      bool new_is_snapshot_supported = true;
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+          new_is_snapshot_supported = false;
+          break;
+        }
+      }
+      is_snapshot_supported_ = new_is_snapshot_supported;
+    }
+    bg_cv_.SignalAll();
+  }
+
+  if (cfd->ioptions()->preserve_internal_time_seconds > 0 ||
+      cfd->ioptions()->preclude_last_level_data_seconds > 0) {
+    s = RegisterRecordSeqnoTimeWorker();
+  }
+
+  if (s.ok()) {
+    // Note that here we erase the associated cf_info of the to-be-dropped
+    // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later inside db_mutex.
+    EraseThreadStatusCfInfo(cfd);
+    assert(cfd->IsDropped());
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Dropped column family with id %u\n", cfd->GetID());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Dropping column family with id %u FAILED -- %s\n",
+                    cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, std::string* timestamp,
+                         bool* value_found) {
+  assert(value != nullptr);
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = read_options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  PinnableSlice pinnable_val;
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = value_found;
+  get_impl_options.timestamp = timestamp;
+  auto s = GetImpl(roptions, key, get_impl_options);
+  value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family) {
+  if (read_options.managed) {
+    return NewErrorIterator(
+        Status::NotSupported("Managed iterator is not supported anymore."));
+  }
+  Iterator* result = nullptr;
+  if (read_options.read_tier == kPersistedTier) {
+    return NewErrorIterator(Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators."));
+  }
+
+  assert(column_family);
+
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  assert(cfd != nullptr);
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
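+  // Caller-side sketch (illustrative only; "db" and "cfh" are a hypothetical
+  // open DB and column family handle):
+  //
+  //   ReadOptions ro;
+  //   ro.snapshot = db->GetSnapshot();  // optional: pin a consistent view
+  //   std::unique_ptr<Iterator> it(db->NewIterator(ro, cfh));
+  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  //     // use it->key() / it->value()
+  //   }
+  //   Status s = it->status();  // always check after the loop
+  //   it.reset();  // destroy the iterator before releasing the snapshot
+  //   db->ReleaseSnapshot(ro.snapshot);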
+  if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+    result = nullptr;
+
+#else
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    auto iter = new ForwardIterator(this, read_options, cfd, sv,
+                                    /* allow_unprepared_value */ true);
+    result = NewDBIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        read_callback, this, cfd);
+#endif
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+    // WritePreparedTxnDB
+    result = NewIteratorImpl(read_options, cfd,
+                             (read_options.snapshot != nullptr)
+                                 ? read_options.snapshot->GetSequenceNumber()
+                                 : kMaxSequenceNumber,
+                             read_callback);
+  }
+  return result;
+}
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+                                            ColumnFamilyData* cfd,
+                                            SequenceNumber snapshot,
+                                            ReadCallback* read_callback,
+                                            bool expose_blob_index,
+                                            bool allow_refresh) {
+  SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+  TEST_SYNC_POINT("DBImpl::NewIterator:1");
+  TEST_SYNC_POINT("DBImpl::NewIterator:2");
+
+  if (snapshot == kMaxSequenceNumber) {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+    // visible to the snapshot before compaction nor the newer data inserted
+    // afterwards.
+    // Note that the super version might not contain all the data available
+    // to this snapshot, but in that case it can see all the data in the
+    // super version, which is a valid consistent state after the user
+    // calls NewIterator().
+    snapshot = versions_->LastSequence();
+    TEST_SYNC_POINT("DBImpl::NewIterator:3");
+    TEST_SYNC_POINT("DBImpl::NewIterator:4");
+  }
+
+  // Try to generate a DB iterator tree in continuous memory area to be
+  // cache friendly. Here is an example of result:
+  // +-------------------------------+
+  // |                               |
+  // | ArenaWrappedDBIter            |
+  // |  +                            |
+  // |  +---> Inner Iterator   ------------+
+  // |  |                            |     |
+  // |  |    +-- -- -- -- -- -- -- --+     |
+  // |  +--- | Arena                 |     |
+  // |       |                       |     |
+  // |          Allocated Memory:    |     |
+  // |       |   +-------------------+     |
+  // |       |   | DBIter            | <---+
+  // |           |  +                |
+  // |       |   |  +-> iter_  ------------+
+  // |       |   |                   |     |
+  // |       |   +-------------------+     |
+  // |       |   | MergingIterator   | <---+
+  // |           |  +                |
+  // |       |   |  +->child iter1  ------------+
+  // |       |   |  |                |          |
+  // |           |  +->child iter2  ----------+ |
+  // |       |   |  |                |        | |
+  // |       |   |  +->child iter3  --------+ | |
+  // |           |                   |      | | |
+  // |       |   +-------------------+      | | |
+  // |       |   | Iterator1         | <--------+
+  // |       |   +-------------------+      | |
+  // |       |   | Iterator2         | <------+
+  // |       |   +-------------------+      |
+  // |       |   | Iterator3         | <----+
+  // |       |   +-------------------+
+  // |       |                       |
+  // +-------+-----------------------+
+  //
+  // ArenaWrappedDBIter inlines an arena area where all the iterators in
+  // the iterator tree are allocated in the order of being accessed when
+  // querying.
+  // Laying out the iterators in the order of being accessed makes it more
+  // likely that any iterator pointer is close to the iterator it points to so
+  // that they are likely to be in the same cache line and/or page.
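+  // In code terms the layout above comes from placement-new into the
+  // iterator's arena; a simplified sketch of the pattern (names abbreviated,
+  // not the exact calls made by NewInternalIterator() below):
+  //
+  //   Arena* arena = db_iter->GetArena();
+  //   char* mem = arena->AllocateAligned(sizeof(DBIter));
+  //   DBIter* iter = new (mem) DBIter(/*...*/);
+  //
+  // Iterators allocated this way are destroyed by invoking their destructors
+  // explicitly; they must not be freed with delete.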
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, + read_options.snapshot != nullptr ? false : allow_refresh); + + InternalIterator* internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot, + /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; +} + +Status DBImpl::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + + if (read_options.timestamp) { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), + /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + } else { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfCfHasTs(cf); + if (!s.ok()) { + return s; + } + } + } + + ReadCallback* read_callback = nullptr; // No read callback provided. + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { +#ifdef ROCKSDB_LITE + return Status::InvalidArgument( + "Tailing iterator not supported in RocksDB lite"); +#else + for (auto cfh : column_families) { + auto cfd = static_cast_with_check(cfh)->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); + iterators->push_back(NewDBIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd)); + } +#endif + } else { + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterators is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? 
read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { + auto* cfd = + static_cast_with_check(column_families[i]) + ->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, snapshot, read_callback)); + } + } + + return Status::OK(); +} + +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } + +#ifndef ROCKSDB_LITE +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); +} +#endif // ROCKSDB_LITE + +std::pair> +DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) { + assert(ts != std::numeric_limits::max()); + + auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true); + return ret; +} + +std::shared_ptr DBImpl::GetTimestampedSnapshot( + uint64_t ts) const { + InstrumentedMutexLock lock_guard(&mutex_); + return timestamped_snapshots_.GetSnapshot(ts); +} + +void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts, + size_t* remaining_total_ss) { + autovector> snapshots_to_release; + { + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release); + } + snapshots_to_release.clear(); + + if (remaining_total_ss) { + InstrumentedMutexLock lock_guard(&mutex_); + *remaining_total_ss = static_cast(snapshots_.count()); + } +} + +Status DBImpl::GetTimestampedSnapshots( + uint64_t ts_lb, uint64_t ts_ub, + std::vector>& timestamped_snapshots) const { + if (ts_lb >= ts_ub) { + return Status::InvalidArgument( + "timestamp lower bound must be smaller than upper bound"); + } + timestamped_snapshots.clear(); + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots); + return Status::OK(); +} + +SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock) { + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + if (lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return nullptr; + } + auto snapshot_seq = GetLastPublishedSequence(); + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary); + if (lock) { + mutex_.Unlock(); + } + return snapshot; +} + +std::pair> +DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts, + bool lock) { + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber); + + if (lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return std::make_pair( + Status::NotSupported("Memtable does not support snapshot"), nullptr); + } + + // Caller is not write thread, thus didn't provide a valid snapshot_seq. + // Obtain seq from db. 
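+  // For example (hypothetical callers): a transaction layer running inside
+  // the write thread that has just allocated sequence 42 may pass
+  // snapshot_seq=42 (need_update_seq == true), while an ordinary caller
+  // passes kMaxSequenceNumber and lets the check below read the last
+  // published sequence: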
+  if (!need_update_seq) {
+    snapshot_seq = GetLastPublishedSequence();
+  }
+
+  std::shared_ptr<const SnapshotImpl> latest =
+      timestamped_snapshots_.GetSnapshot(std::numeric_limits<uint64_t>::max());
+
+  // If there is already a latest timestamped snapshot, then we need to do some
+  // checks.
+  if (latest) {
+    uint64_t latest_snap_ts = latest->GetTimestamp();
+    SequenceNumber latest_snap_seq = latest->GetSequenceNumber();
+    assert(latest_snap_seq <= snapshot_seq);
+    bool needs_create_snap = true;
+    Status status;
+    std::shared_ptr<const SnapshotImpl> ret;
+    if (latest_snap_ts > ts) {
+      // A snapshot created later cannot have a smaller timestamp than a
+      // previous timestamped snapshot.
+      needs_create_snap = false;
+      std::ostringstream oss;
+      oss << "snapshot exists with larger timestamp " << latest_snap_ts
+          << " > " << ts;
+      status = Status::InvalidArgument(oss.str());
+    } else if (latest_snap_ts == ts) {
+      if (latest_snap_seq == snapshot_seq) {
+        // We are requesting the same sequence number and timestamp, thus can
+        // safely reuse (share) the current latest timestamped snapshot.
+        needs_create_snap = false;
+        ret = latest;
+      } else if (latest_snap_seq < snapshot_seq) {
+        // There may have been writes to the database since the latest
+        // timestamped snapshot, yet we are still requesting the same
+        // timestamp. In this case, we cannot create the new timestamped
+        // snapshot.
+        needs_create_snap = false;
+        std::ostringstream oss;
+        oss << "Allocated seq is " << snapshot_seq
+            << ", while snapshot exists with smaller seq " << latest_snap_seq
+            << " but same timestamp " << ts;
+        status = Status::InvalidArgument(oss.str());
+      }
+    }
+    if (!needs_create_snap) {
+      if (lock) {
+        mutex_.Unlock();
+      }
+      delete s;
+      return std::make_pair(status, ret);
+    } else {
+      status.PermitUncheckedError();
+    }
+  }
+
+  SnapshotImpl* snapshot =
+      snapshots_.New(s, snapshot_seq, unix_time,
+                     /*is_write_conflict_boundary=*/true, ts);
+
+  std::shared_ptr<const SnapshotImpl> ret(
+      snapshot,
+      std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1));
+  timestamped_snapshots_.AddSnapshot(ret);
+
+  // Caller is from write thread, and we need to update database's sequence
+  // number.
+  if (need_update_seq) {
+    assert(versions_);
+    if (last_seq_same_as_publish_seq_) {
+      versions_->SetLastSequence(snapshot_seq);
+    } else {
+      // TODO: support write-prepared/write-unprepared transactions with two
+      // write queues.
+      assert(false);
+    }
+  }
+
+  if (lock) {
+    mutex_.Unlock();
+  }
+  return std::make_pair(Status::OK(), ret);
+}
+
+namespace {
+using CfdList = autovector<ColumnFamilyData*, 2>;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+  for (const ColumnFamilyData* t : list) {
+    if (t == cfd) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  if (s == nullptr) {
+    // DBImpl::GetSnapshot() can return nullptr when snapshots are not
+    // supported, e.g. when the memtable was created with
+    // inplace_update_support enabled.
+    return;
+  }
+  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    snapshots_.Delete(casted_s);
+    uint64_t oldest_snapshot;
+    if (snapshots_.empty()) {
+      oldest_snapshot = GetLastPublishedSequence();
+    } else {
+      oldest_snapshot = snapshots_.oldest()->number_;
+    }
+    // Avoid going through every column family by checking a global threshold
+    // first.
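+    // Worked example (hypothetical numbers): with
+    // bottommost_files_mark_threshold_ == 300, releasing snapshots until the
+    // oldest remaining one is at seq 500 may make some bottommost file (all
+    // of whose keys have seqno <= 300) eligible for compaction, so the
+    // per-CF scan below runs; if the oldest remaining snapshot were at seq
+    // 200, nothing could have become eligible and the scan is skipped.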
+ if (oldest_snapshot > bottommost_files_mark_threshold_) { + CfdList cf_scheduled; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->ioptions()->allow_ingest_behind) { + cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); + if (!cfd->current() + ->storage_info() + ->BottommostFilesMarkedForCompaction() + .empty()) { + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + cf_scheduled.push_back(cfd); + } + } + } + + // Calculate a new threshold, skipping those CFs where compactions are + // scheduled. We do not do the same pass as the previous loop because + // mutex might be unlocked during the loop, making the result inaccurate. + SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (CfdListContains(cf_scheduled, cfd) || + cfd->ioptions()->allow_ingest_behind) { + continue; + } + new_bottommost_files_mark_threshold = std::min( + new_bottommost_files_mark_threshold, + cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; + } + } + delete casted_s; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfAllTables(props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, + const Range* range, std::size_t n, + TablePropertiesCollection* props) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfTablesInRange(range, n, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +#endif // ROCKSDB_LITE + +const std::string& DBImpl::GetName() const { return dbname_; } + +Env* DBImpl::GetEnv() const { return env_; } + +FileSystem* DB::GetFileSystem() const { + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); +} + +FileSystem* DBImpl::GetFileSystem() const { + return immutable_db_options_.fs.get(); +} + +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + +#ifndef ROCKSDB_LITE + +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + assert(trace_writer != nullptr); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndIOTrace() { + io_tracer_->EndIOTrace(); + return Status::OK(); +} + +#endif // ROCKSDB_LITE + +Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { + InstrumentedMutexLock l(&mutex_); + auto cfh = static_cast_with_check(column_family); + return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfh->cfd()->GetLatestCFOptions()); +} + +DBOptions DBImpl::GetDBOptions() const { + InstrumentedMutexLock l(&mutex_); + return BuildDBOptions(immutable_db_options_, mutable_db_options_); +} + +bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { + 
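+  // Caller-side sketch (the property names are built-in; "db" and "cfh" are
+  // hypothetical):
+  //
+  //   std::string v;
+  //   if (db->GetProperty(cfh, "rocksdb.estimate-num-keys", &v)) {
+  //     // v holds the estimate as a decimal string
+  //   }
+  //   uint64_t n = 0;
+  //   db->GetIntProperty(cfh, "rocksdb.cur-size-all-mem-tables", &n);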
const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = + static_cast_with_check(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_int) { + uint64_t int_value; + bool ret_value = + GetIntPropertyInternal(cfd, *property_info, false, &int_value); + if (ret_value) { + *value = std::to_string(int_value); + } + return ret_value; + } else if (property_info->handle_string) { + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } + } else if (property_info->handle_string_dbimpl) { + if (property_info->need_out_of_mutex) { + return (this->*(property_info->handle_string_dbimpl))(value); + } else { + InstrumentedMutexLock l(&mutex_); + return (this->*(property_info->handle_string_dbimpl))(value); + } + } + // Shouldn't reach here since exactly one of handle_string and handle_int + // should be non-nullptr. + assert(false); + return false; +} + +bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family, + const Slice& property, + std::map* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = + static_cast_with_check(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_map) { + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } + } + // If we reach this point it means that handle_map is not provided for the + // requested property + return false; +} + +bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + auto cfd = + static_cast_with_check(column_family)->cfd(); + return GetIntPropertyInternal(cfd, *property_info, false, value); +} + +bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value) { + assert(property_info.handle_int != nullptr); + if (!property_info.need_out_of_mutex) { + if (is_locked) { + mutex_.AssertHeld(); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } + } else { + SuperVersion* sv = nullptr; + if (is_locked) { + mutex_.Unlock(); + } + sv = GetAndRefSuperVersion(cfd); + + bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( + property_info, sv->current, value); + + ReturnAndCleanupSuperVersion(cfd, sv); + if (is_locked) { + mutex_.Lock(); + } + + return ret; + } +} + +bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { + assert(value != nullptr); + Statistics* statistics = immutable_db_options_.stats; + if (!statistics) { + return false; + } + *value = statistics->ToString(); + return true; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::ResetStats() { + InstrumentedMutexLock l(&mutex_); + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->Clear(); + } + } + 
return Status::OK(); +} +#endif // ROCKSDB_LITE + +bool DBImpl::GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + + uint64_t sum = 0; + bool ret = true; + { + // Needs mutex to protect the list of column families. + InstrumentedMutexLock l(&mutex_); + uint64_t value; + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { + if (!cfd->initialized()) { + continue; + } + ret = GetIntPropertyInternal(cfd, *property_info, true, &value); + // GetIntPropertyInternal may release db mutex and re-acquire it. + mutex_.AssertHeld(); + if (ret) { + sum += value; + } else { + ret = false; + break; + } + } + } + *aggregated_value = sum; + return ret; +} + +SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { + // TODO(ljin): consider using GetReferencedSuperVersion() directly + return cfd->GetThreadLocalSuperVersion(this); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + if (!cfd) { + return nullptr; + } + + return GetAndRefSuperVersion(cfd); +} + +void DBImpl::CleanupSuperVersion(SuperVersion* sv) { + // Release SuperVersion + if (sv->Unref()) { + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; + { + InstrumentedMutexLock l(&mutex_); + sv->Cleanup(); + if (defer_purge) { + AddSuperVersionsToFreeQueue(sv); + SchedulePurge(); + } + } + if (!defer_purge) { + delete sv; + } + RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); +} + +void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, + SuperVersion* sv) { + if (!cfd->ReturnThreadLocalSuperVersion(sv)) { + CleanupSuperVersion(sv); + } +} + +// REQUIRED: this function should only be called on the write thread. +void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, + SuperVersion* sv) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + + // If SuperVersion is held, and we successfully fetched a cfd using + // GetAndRefSuperVersion(), it must still exist. + assert(cfd != nullptr); + ReturnAndCleanupSuperVersion(cfd, sv); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + +// REQUIRED: mutex is NOT held. 
+std::unique_ptr DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + InstrumentedMutexLock l(&mutex_); + + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id); + if (cfd == nullptr) { + return nullptr; + } + + return std::unique_ptr( + new ColumnFamilyHandleImpl(cfd, this, &mutex_)); +} + +void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) { + ColumnFamilyHandleImpl* cfh = + static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + + // Convert user_key into a corresponding internal key. + InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); + MemTable::MemTableStats memStats = + sv->mem->ApproximateStats(k1.Encode(), k2.Encode()); + MemTable::MemTableStats immStats = + sv->imm->ApproximateStats(k1.Encode(), k2.Encode()); + *count = memStats.count + immStats.count; + *size = memStats.size + immStats.size; + + ReturnAndCleanupSuperVersion(cfd, sv); +} + +Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + if (!options.include_memtables && !options.include_files) { + return Status::InvalidArgument("Invalid options"); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + + Version* v; + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + v = sv->current; + + for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } + // Convert user_key into a corresponding internal key. + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); + sizes[i] = 0; + if (options.include_files) { + sizes[i] += versions_->ApproximateSize( + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); + } + if (options.include_memtables) { + sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; + sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size; + } + } + + ReturnAndCleanupSuperVersion(cfd, sv); + return Status::OK(); +} + +std::list::iterator +DBImpl::CaptureCurrentFileNumberInPendingOutputs() { + // We need to remember the iterator of our insert, because after the + // background job is done, we need to remove that element from + // pending_outputs_. 
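+  // The usual pairing at call sites looks like this (simplified sketch):
+  //
+  //   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_elem(
+  //       new std::list<uint64_t>::iterator(
+  //           CaptureCurrentFileNumberInPendingOutputs()));
+  //   // ... background job that may allocate new file numbers ...
+  //   ReleaseFileNumberFromPendingOutputs(pending_outputs_elem);
+  //
+  // which keeps FindObsoleteFiles() from treating files at or above the
+  // captured number as obsolete while the job is still in flight.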
+ pending_outputs_.push_back(versions_->current_next_file_number()); + auto pending_outputs_inserted_elem = pending_outputs_.end(); + --pending_outputs_inserted_elem; + return pending_outputs_inserted_elem; +} + +void DBImpl::ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v) { + if (v.get() != nullptr) { + pending_outputs_.erase(*v.get()); + v.reset(); + } +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) { + RecordTick(stats_, GET_UPDATES_SINCE_CALLS); + if (seq_per_batch_) { + return Status::NotSupported( + "This API is not yet compatible with write-prepared/write-unprepared " + "transactions"); + } + if (seq > versions_->LastSequence()) { + return Status::NotFound("Requested sequence not yet written in the db"); + } + return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); +} + +Status DBImpl::DeleteFile(std::string name) { + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kWalFile)) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", + name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + if (type == kWalFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed - not archived log.\n", + name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + Status status = wal_manager_.DeleteFile(name, number); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed -- %s.\n", name.c_str(), + status.ToString().c_str()); + } + return status; + } + + Status status; + int level; + FileMetaData* metadata; + ColumnFamilyData* cfd; + VersionEdit edit; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); + if (!status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed. File not found\n", name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not found"); + } + assert(level < cfd->NumberLevels()); + + // If the file is being compacted no need to delete. + if (metadata->being_compacted) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DeleteFile %s Skipped. File about to be compacted\n", + name.c_str()); + job_context.Clean(); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. + auto* vstoreage = cfd->current()->storage_info(); + for (int i = level + 1; i < cfd->NumberLevels(); i++) { + if (vstoreage->NumLevelFiles(i) != 0) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s FAILED. 
File not in last level\n", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not in last level"); + } + } + // if level == 0, it has to be the oldest file + if (level == 0 && + vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed ---" + " target file in level 0 must be the oldest.", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(level, number); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end) { + Status status = Status::OK(); + auto cfh = static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + VersionEdit edit; + std::set deleted_files; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + Version* input_version = cfd->current(); + + auto* vstorage = input_version->storage_info(); + for (size_t r = 0; r < n; r++) { + auto begin = ranges[r].start, end = ranges[r].limit; + for (int i = 1; i < cfd->NumberLevels(); i++) { + if (vstorage->LevelFiles(i).empty() || + !vstorage->OverlapInLevel(i, begin, end)) { + continue; + } + std::vector level_files; + InternalKey begin_storage, end_storage, *begin_key, *end_key; + if (begin == nullptr) { + begin_key = nullptr; + } else { + begin_storage.SetMinPossibleForUserKey(*begin); + begin_key = &begin_storage; + } + if (end == nullptr) { + end_key = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + end_key = &end_storage; + } + + vstorage->GetCleanInputsWithinInterval( + i, begin_key, end_key, &level_files, -1 /* hint_index */, + nullptr /* file_index */); + FileMetaData* level_file; + for (uint32_t j = 0; j < level_files.size(); j++) { + level_file = level_files[j]; + if (level_file->being_compacted) { + continue; + } + if (deleted_files.find(level_file) != deleted_files.end()) { + continue; + } + if (!include_end && end != nullptr && + cfd->user_comparator()->Compare(level_file->largest.user_key(), + *end) == 0) { + continue; + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.insert(level_file); + level_file->being_compacted = true; + } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); + } + } + if (edit.GetDeletedFiles().empty()) { + job_context.Clean(); + return status; + } + input_version->Ref(); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + for (auto* deleted_file : deleted_files) { + deleted_file->being_compacted = false; + } + input_version->Unref(); + 
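+    // Caller-side sketch (hypothetical keys), via the convenience wrapper
+    // DeleteFilesInRanges(DB*, ...): delete the files fully contained in
+    // ["a", "m") and ["x", +inf):
+    //
+    //   Slice a("a"), m("m"), x("x");
+    //   RangePtr ranges[2] = {RangePtr(&a, &m), RangePtr(&x, nullptr)};
+    //   Status s = DeleteFilesInRanges(db, cfh, ranges, 2,
+    //                                  /*include_end=*/false);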
FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + versions_->GetLiveFilesMetaData(metadata); +} + +Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { + InstrumentedMutexLock l(&mutex_); + return versions_->GetLiveFilesChecksumInfo(checksum_list); +} + +void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { + assert(column_family); + auto* cfd = + static_cast_with_check(column_family)->cfd(); + auto* sv = GetAndRefSuperVersion(cfd); + { + // Without mutex, Version::GetColumnFamilyMetaData will have data race with + // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but + // this may cause regression. An alternative is to make + // FileMetaData::being_compacted atomic, but it will make FileMetaData + // non-copy-able. Another option is to separate these variables from + // original FileMetaData struct, and this requires re-organization of data + // structures. For now, we take the easy approach. If + // DB::GetColumnFamilyMetaData is not called frequently, the regression + // should not be big. We still need to keep an eye on it. + InstrumentedMutexLock l(&mutex_); + sv->current->GetColumnFamilyMetaData(cf_meta); + } + ReturnAndCleanupSuperVersion(cfd, sv); +} + +void DBImpl::GetAllColumnFamilyMetaData( + std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *(versions_->GetColumnFamilySet())) { + { + metadata->emplace_back(); + cfd->current()->GetColumnFamilyMetaData(&metadata->back()); + } + } +} + +#endif // ROCKSDB_LITE + +Status DBImpl::CheckConsistency() { + mutex_.AssertHeld(); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); + + std::string corruption_messages; + + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + // Instead of calling GetFileSize() for each expected file, call + // GetChildren() for the DB directory and check that all expected files + // are listed, without checking their sizes. + // Since sst files might be in different directories, do it for each + // directory separately. + std::map> files_by_directory; + for (const auto& md : metadata) { + // md.name has a leading "/". Remove it. 
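+      // (For example, "/000123.sst" becomes "000123.sst" before the
+      // per-directory lookup below; the file number is illustrative.)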
+ std::string fname = md.name; + if (!fname.empty() && fname[0] == '/') { + fname = fname.substr(1); + } + files_by_directory[md.db_path].push_back(fname); + } + + IOOptions io_opts; + io_opts.do_not_recurse = true; + for (const auto& dir_files : files_by_directory) { + std::string directory = dir_files.first; + std::vector existing_files; + Status s = fs_->GetChildren(directory, io_opts, &existing_files, + /*IODebugContext*=*/nullptr); + if (!s.ok()) { + corruption_messages += + "Can't list files in " + directory + ": " + s.ToString() + "\n"; + continue; + } + std::sort(existing_files.begin(), existing_files.end()); + + for (const std::string& fname : dir_files.second) { + if (!std::binary_search(existing_files.begin(), existing_files.end(), + fname) && + !std::binary_search(existing_files.begin(), existing_files.end(), + Rocks2LevelTableFileName(fname))) { + corruption_messages += + "Missing sst file " + fname + " in " + directory + "\n"; + } + } + } + } else { + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + file_path + + ". Size recorded in manifest " + + std::to_string(md.size) + ", actual size " + + std::to_string(fsize) + "\n"; + } + } + } + + if (corruption_messages.size() == 0) { + return Status::OK(); + } else { + return Status::Corruption(corruption_messages); + } +} + +Status DBImpl::GetDbIdentity(std::string& identity) const { + identity.assign(db_id_); + return Status::OK(); +} + +Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { + std::string idfilename = IdentityFileName(dbname_); + const FileOptions soptions; + + Status s = ReadFileToString(fs_.get(), idfilename, identity); + if (!s.ok()) { + return s; + } + + // If last character is '\n' remove it from identity. (Old implementations + // of Env::GenerateUniqueId() would include a trailing '\n'.) + if (identity->size() > 0 && identity->back() == '\n') { + identity->pop_back(); + } + return s; +} + +Status DBImpl::GetDbSessionId(std::string& session_id) const { + session_id.assign(db_session_id_); + return Status::OK(); +} + +namespace { +SemiStructuredUniqueIdGen* DbSessionIdGen() { + static SemiStructuredUniqueIdGen gen; + return &gen; +} +} // namespace + +void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); } + +std::string DBImpl::GenerateDbSessionId(Env*) { + // See SemiStructuredUniqueIdGen for its desirable properties. 
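+  // The result is rendered by EncodeSessionId(hi, lo) below as a short
+  // base-36 string, e.g. something like "V90YQ8JY2Y2KA2RQCRPD" (made-up
+  // illustrative value).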
+ auto gen = DbSessionIdGen(); + + uint64_t lo, hi; + gen->GenerateNext(&hi, &lo); + if (lo == 0) { + // Avoid emitting session ID with lo==0, so that SST unique + // IDs can be more easily ensured non-zero + gen->GenerateNext(&hi, &lo); + assert(lo != 0); + } + return EncodeSessionId(hi, lo); +} + +void DBImpl::SetDbSessionId() { + db_session_id_ = GenerateDbSessionId(env_); + TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_); +} + +// Default implementation -- returns not supported status +Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family_name*/, + ColumnFamilyHandle** /*handle*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const std::vector& /*column_families*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamilies( + const std::vector& /*column_families*/) { + return Status::NotSupported(""); +} + +Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + if (DefaultColumnFamily() == column_family) { + return Status::InvalidArgument( + "Cannot destroy the handle returned by DefaultColumnFamily()"); + } + delete column_family; + return Status::OK(); +} + +DB::~DB() {} + +Status DBImpl::Close() { + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); + if (closed_) { + return closing_status_; + } + + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + if (!s.ok()) { + return s; + } + } + + closing_status_ = CloseImpl(); + closed_ = true; + return closing_status_; +} + +Status DB::ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families) { + const std::shared_ptr& fs = db_options.env->GetFileSystem(); + return VersionSet::ListColumnFamilies(column_families, name, fs.get()); +} + +Snapshot::~Snapshot() {} + +Status DestroyDB(const std::string& dbname, const Options& options, + const std::vector& column_families) { + ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + std::vector filenames; + bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); + + // Reset the logger because it holds a handle to the + // log file and prevents cleanup and directory removal + soptions.info_log.reset(); + IOOptions io_opts; + // Ignore error in case directory does not exist + soptions.fs + ->GetChildren(dbname, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .PermitUncheckedError(); + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname); + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + std::string path_to_delete = dbname + "/" + fname; + if (type == kMetaDatabase) { + del = DestroyDB(path_to_delete, options); + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { + del = DeleteDBFile( + &soptions, path_to_delete, dbname, + /*force_bg=*/false, + /*force_fg=*/(type == 
kWalFile) ? !wal_in_db_path : false); + } else { + del = env->DeleteFile(path_to_delete); + } + if (!del.ok() && result.ok()) { + result = del; + } + } + } + + std::set paths; + for (const DbPath& db_path : options.db_paths) { + paths.insert(db_path.path); + } + for (const ColumnFamilyDescriptor& cf : column_families) { + for (const DbPath& cf_path : cf.options.cf_paths) { + paths.insert(cf_path.path); + } + } + + for (const auto& path : paths) { + if (soptions.fs + ->GetChildren(path, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .ok()) { + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, &type) && + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // TODO: Should we return an error if we cannot delete the directory? + env->DeleteDir(path).PermitUncheckedError(); + } + } + + std::vector walDirFiles; + std::string archivedir = ArchivalDirectory(dbname); + bool wal_dir_exists = false; + if (!soptions.IsWalDirSameAsDBPath(dbname)) { + wal_dir_exists = + soptions.fs + ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles, + /*IODebugContext*=*/nullptr) + .ok(); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + // Archive dir may be inside wal dir or dbname and should be + // processed and removed before those otherwise we have issues + // removing them + std::vector archiveFiles; + if (soptions.fs + ->GetChildren(archivedir, io_opts, &archiveFiles, + /*IODebugContext*=*/nullptr) + .ok()) { + // Delete archival files. + for (const auto& file : archiveFiles) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { + Status del = + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // Ignore error in case dir contains other files + env->DeleteDir(archivedir).PermitUncheckedError(); + } + + // Delete log files in the WAL dir + if (wal_dir_exists) { + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { + Status del = + DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir).PermitUncheckedError(); + } + + // Ignore error since state is already gone + env->UnlockFile(lock).PermitUncheckedError(); + env->DeleteFile(lockname).PermitUncheckedError(); + + // sst_file_manager holds a ref to the logger. Make sure the logger is + // gone before trying to remove the directory. 
+    soptions.sst_file_manager.reset();
+
+    // Ignore error in case dir contains other files
+    env->DeleteDir(dbname).PermitUncheckedError();
+  }
+  return result;
+}
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+                                bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+  WriteThread::Writer w;
+  if (need_mutex_lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.EnterUnbatched(&w, &mutex_);
+  }
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+
+  // This part requires mutex to protect the column family options
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cf_names.push_back(cfd->GetName());
+    cf_opts.push_back(cfd->GetLatestCFOptions());
+  }
+
+  // Unlock during expensive operations. New writes cannot get here
+  // because the single write thread ensures all new writes get queued.
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  mutex_.Unlock();
+
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions",
+                           &db_options);
+
+  std::string file_name =
+      TempOptionsFileName(GetName(), versions_->NewFileNumber());
+  Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+                                   fs_.get());
+
+  if (s.ok()) {
+    s = RenameTempFileToOptionsFile(file_name);
+  }
+  // restore lock
+  if (!need_mutex_lock) {
+    mutex_.Lock();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.ExitUnbatched(&w);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+    if (immutable_db_options_.fail_if_options_file_error) {
+      return Status::IOError("Unable to persist options.",
+                             s.ToString().c_str());
+    }
+  }
+#else
+  (void)need_mutex_lock;
+  (void)need_enter_write_thread;
+#endif  // !ROCKSDB_LITE
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+                              const size_t num_files_to_keep,
+                              const std::shared_ptr<Logger>& info_log,
+                              Env* env) {
+  if (filenames.size() <= num_files_to_keep) {
+    return;
+  }
+  for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+       iter != filenames.end(); ++iter) {
+    if (!env->DeleteFile(iter->second).ok()) {
+      ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+                     iter->second.c_str());
+    }
+  }
+}
+}  // namespace
+#endif  // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+  std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
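+  // The map key is std::numeric_limits<uint64_t>::max() - file_number, so a
+  // larger (newer) file number sorts first: with hypothetical files
+  // OPTIONS-000037 and OPTIONS-000012, OPTIONS-000037 gets the smaller key
+  // and survives the trim below.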
+  std::map<uint64_t, std::string> options_filenames;
+  Status s;
+  IOOptions io_opts;
+  io_opts.do_not_recurse = true;
+  s = fs_->GetChildren(GetName(), io_opts, &filenames,
+                       /*IODebugContext*=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  for (auto& filename : filenames) {
+    uint64_t file_number;
+    FileType type;
+    if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+      options_filenames.insert(
+          {std::numeric_limits<uint64_t>::max() - file_number,
+           GetName() + "/" + filename});
+    }
+  }
+
+  // Keep the latest 2 OPTIONS files
+  const size_t kNumOptionsFilesKept = 2;
+  DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+                           immutable_db_options_.info_log, GetEnv());
+  return Status::OK();
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+  Status s;
+
+  uint64_t options_file_number = versions_->NewFileNumber();
+  std::string options_file_name =
+      OptionsFileName(GetName(), options_file_number);
+  uint64_t options_file_size = 0;
+  s = GetEnv()->GetFileSize(file_name, &options_file_size);
+  if (s.ok()) {
+    // Retry if the file name happens to conflict with an existing one.
+    s = GetEnv()->RenameFile(file_name, options_file_name);
+    std::unique_ptr<FSDirectory> dir_obj;
+    if (s.ok()) {
+      s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
+    }
+    if (s.ok()) {
+      s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                       DirFsyncOptions(options_file_name));
+    }
+    if (s.ok()) {
+      Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+      // The default Close() could return "NotSupported" and we bypass it
+      // if it is not implemented. Detailed explanations can be found in
+      // db/db_impl/db_impl.h
+      if (!temp_s.ok()) {
+        if (temp_s.IsNotSupported()) {
+          temp_s.PermitUncheckedError();
+        } else {
+          s = temp_s;
+        }
+      }
+    }
+  }
+  if (s.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    versions_->options_file_number_ = options_file_number;
+    versions_->options_file_size_ = options_file_size;
+  }
+
+  if (0 == disable_delete_obsolete_files_) {
+    // TODO: Should we check for errors here?
+ DeleteObsoleteOptionsFiles().PermitUncheckedError(); + } + return s; +#else + (void)file_name; + return Status::OK(); +#endif // !ROCKSDB_LITE +} + +#ifdef ROCKSDB_USING_THREAD_STATUS + +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), + cfd->ioptions()->env); + } +} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseColumnFamilyInfo(cfd); + } +} + +void DBImpl::EraseThreadStatusDbInfo() const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseDatabaseInfo(this); + } +} + +#else +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusDbInfo() const {} +#endif // ROCKSDB_USING_THREAD_STATUS + +// +// A global method that can dump out the build version +void DumpRocksDBBuildVersion(Logger* log) { + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } +} + +#ifndef ROCKSDB_LITE +SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, + bool include_history) { + // Find the earliest sequence number that we know we can rely on reading + // from the memtable without needing to check sst files. + SequenceNumber earliest_seq = + sv->imm->GetEarliestSequenceNumber(include_history); + if (earliest_seq == kMaxSequenceNumber) { + earliest_seq = sv->mem->GetEarliestSequenceNumber(); + } + assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq); + + return earliest_seq; +} + +Status DBImpl::GetLatestSequenceForKey( + SuperVersion* sv, const Slice& key, bool cache_only, + SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp, + bool* found_record_for_key, bool* is_blob_index) { + Status s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + + ReadOptions read_options; + SequenceNumber current_seq = versions_->LastSequence(); + + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + std::string ts_buf; + if (ts_sz > 0) { + assert(timestamp); + ts_buf.assign(ts_sz, '\xff'); + } else { + assert(!timestamp); + } + Slice ts(ts_buf); + + LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts); + + *seq = kMaxSequenceNumber; + *found_record_for_key = false; + + // Check if there is a record for this key in the latest memtable + sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, seq, read_options, + false /* immutable_memtable */, nullptr /*read_callback*/, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. 
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTable::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLatestSequenceForKey:mem", timestamp);
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check immutable memtables
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+  if (lower_bound_in_mem != kMaxSequenceNumber &&
+      lower_bound_in_mem < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the immutable memtables
+  sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+               &merge_context, &max_covering_tombstone_seq, seq, read_options,
+               nullptr /*read_callback*/, is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTableList::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check memtable history
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+  if (lower_bound_in_imm != kMaxSequenceNumber &&
+      lower_bound_in_imm < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the memtable history
+  sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                          timestamp, &s, &merge_context,
+                          &max_covering_tombstone_seq, seq, read_options,
+                          is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(
+        immutable_db_options_.info_log,
+        "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check SST files
+    assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff'));
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+  // check here to skip the history if possible. But currently the caller
+  // already does that. Maybe we should move the logic here later.
+
+  // TODO(agiardullo): possible optimization: consider checking cached
+  // SST files if cache_only=true?
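+  // The lookup cascade above (active memtable -> immutable memtables ->
+  // memtable history) is how transaction implementations check for
+  // write-write conflicts without touching SST files. A hedged sketch of how
+  // a caller might use it (the names `sv`, `key`, and `snapshot_seq` are
+  // illustrative, not from this file):
+  //
+  //   SequenceNumber seq = kMaxSequenceNumber;
+  //   bool found = false;
+  //   Status s = db_impl->GetLatestSequenceForKey(
+  //       sv, key, /*cache_only=*/true, /*lower_bound_seq=*/snapshot_seq,
+  //       &seq, /*timestamp=*/nullptr, &found, /*is_blob_index=*/nullptr);
+  //   if (s.ok() && found && seq > snapshot_seq) {
+  //     // Another writer touched `key` after the snapshot: conflict.
+  //   }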
+  if (!cache_only) {
+    // Check tables
+    PinnedIteratorsManager pinned_iters_mgr;
+    sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                     timestamp, &s, &merge_context,
+                     &max_covering_tombstone_seq, &pinned_iters_mgr,
+                     nullptr /* value_found */, found_record_for_key, seq,
+                     nullptr /*read_callback*/, is_blob_index);
+
+    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+      // unexpected error reading SST files
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Unexpected status returned from Version::Get: %s\n",
+                      s.ToString().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::IngestExternalFile(
+    ColumnFamilyHandle* column_family,
+    const std::vector<std::string>& external_files,
+    const IngestExternalFileOptions& ingestion_options) {
+  IngestExternalFileArg arg;
+  arg.column_family = column_family;
+  arg.external_files = external_files;
+  arg.options = ingestion_options;
+  return IngestExternalFiles({arg});
+}
+
+Status DBImpl::IngestExternalFiles(
+    const std::vector<IngestExternalFileArg>& args) {
+  if (args.empty()) {
+    return Status::InvalidArgument("ingestion arg list is empty");
+  }
+  {
+    std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+    for (const auto& arg : args) {
+      if (arg.column_family == nullptr) {
+        return Status::InvalidArgument("column family handle is null");
+      } else if (unique_cfhs.count(arg.column_family) > 0) {
+        return Status::InvalidArgument(
+            "ingestion args have duplicate column families");
+      }
+      unique_cfhs.insert(arg.column_family);
+    }
+  }
+  // Ingest multiple external SST files atomically.
+  const size_t num_cfs = args.size();
+  for (size_t i = 0; i != num_cfs; ++i) {
+    if (args[i].external_files.empty()) {
+      char err_msg[128] = {0};
+      snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+      return Status::InvalidArgument(err_msg);
+    }
+  }
+  for (const auto& arg : args) {
+    const IngestExternalFileOptions& ingest_opts = arg.options;
+    if (ingest_opts.ingest_behind &&
+        !immutable_db_options_.allow_ingest_behind) {
+      return Status::InvalidArgument(
+          "can't ingest_behind file in DB with allow_ingest_behind=false");
+    }
+  }
+
+  // TODO (yanqin) maybe handle the case in which column_families have
+  // duplicates
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  size_t total = 0;
+  for (const auto& arg : args) {
+    total += arg.external_files.size();
+  }
+  uint64_t next_file_number = 0;
+  Status status = ReserveFileNumbersBeforeIngestion(
+      static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(),
+      total, pending_output_elem, &next_file_number);
+  if (!status.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+  for (const auto& arg : args) {
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+    ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_,
+                                file_options_, &snapshots_, arg.options,
+                                &directories_, &event_logger_, io_tracer_);
+  }
+
+  // TODO(yanqin) maybe make jobs run in parallel
+  uint64_t start_file_number = next_file_number;
+  for (size_t i = 1; i != num_cfs; ++i) {
+    start_file_number += args[i - 1].external_files.size();
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    Status es = ingestion_jobs[i].Prepare(
+        args[i].external_files, args[i].files_checksums,
+        args[i].files_checksum_func_names, args[i].file_temperature,
+        start_file_number, super_version);
+    // capture first error only
+    if (!es.ok() && status.ok()) {
+      status = es;
+    }
+
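+    // For reference, the atomic multi-CF entry point being serviced here is
+    // driven from user code roughly like this hedged sketch (the handles
+    // `cf1`/`cf2` and the SST paths are illustrative, not from this file):
+    //
+    //   IngestExternalFileArg a1, a2;
+    //   a1.column_family = cf1;
+    //   a1.external_files = {"/ingest/cf1_000.sst"};
+    //   a2.column_family = cf2;
+    //   a2.external_files = {"/ingest/cf2_000.sst"};
+    //   Status s = db->IngestExternalFiles({a1, a2});  // all-or-nothing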
+    CleanupSuperVersion(super_version);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+  {
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    Status es = ingestion_jobs[0].Prepare(
+        args[0].external_files, args[0].files_checksums,
+        args[0].files_checksum_func_names, args[0].file_temperature,
+        next_file_number, super_version);
+    if (!es.ok()) {
+      status = es;
+    }
+    CleanupSuperVersion(super_version);
+  }
+  if (!status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      ingestion_jobs[i].Cleanup(status);
+    }
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<SuperVersionContext> sv_ctxs;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs.emplace_back(true /* create_superversion */);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+  TEST_SYNC_POINT("DBImpl::AddFile:Start");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+    // Stop writes to the DB by entering both write threads
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
+    WriteThread::Writer nonmem_w;
+    if (two_write_queues_) {
+      nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    }
+
+    // When unordered_write is enabled, keys are written to the memtable in an
+    // unordered way. If the ingestion job checks the memtable key range
+    // before the key lands in the memtable, the ingestion job may skip the
+    // necessary memtable flush.
+    // So wait here to ensure there is no pending write to the memtable.
+    WaitForPendingWrites();
+
+    num_running_ingest_file_ += static_cast<int>(num_cfs);
+    TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+    bool at_least_one_cf_need_flush = false;
+    std::vector<bool> need_flush(num_cfs, false);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (cfd->IsDropped()) {
+        // TODO (yanqin) investigate whether we should abort ingestion or
+        // proceed with other non-dropped column families.
+        status = Status::InvalidArgument(
+            "cannot ingest an external file into a dropped CF");
+        break;
+      }
+      bool tmp = false;
+      status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+      need_flush[i] = tmp;
+      at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+      if (!status.ok()) {
+        break;
+      }
+    }
+    TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+                             &at_least_one_cf_need_flush);
+
+    if (status.ok() && at_least_one_cf_need_flush) {
+      FlushOptions flush_opts;
+      flush_opts.allow_write_stall = true;
+      if (immutable_db_options_.atomic_flush) {
+        autovector<ColumnFamilyData*> cfds_to_flush;
+        SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+        mutex_.Unlock();
+        status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+                                      FlushReason::kExternalFileIngestion,
+                                      true /* entered_write_thread */);
+        mutex_.Lock();
+      } else {
+        for (size_t i = 0; i != num_cfs; ++i) {
+          if (need_flush[i]) {
+            mutex_.Unlock();
+            auto* cfd =
+                static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+                    ->cfd();
+            status = FlushMemTable(cfd, flush_opts,
+                                   FlushReason::kExternalFileIngestion,
+                                   true /* entered_write_thread */);
+            mutex_.Lock();
+            if (!status.ok()) {
+              break;
+            }
+          }
+        }
+      }
+    }
+    // Run ingestion jobs.
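+    // Descriptive note: each job's Run() assigns the reserved global
+    // sequence number(s) to its ingested files and picks the lowest LSM level
+    // whose existing files do not overlap the ingested key range; nothing
+    // becomes visible to readers until the combined VersionEdits are
+    // committed to the MANIFEST below (see ExternalSstFileIngestionJob).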
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        status = ingestion_jobs[i].Run();
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+    if (status.ok()) {
+      autovector<ColumnFamilyData*> cfds_to_commit;
+      autovector<const MutableCFOptions*> mutable_cf_options_list;
+      autovector<autovector<VersionEdit*>> edit_lists;
+      uint32_t num_entries = 0;
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        cfds_to_commit.push_back(cfd);
+        mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+        autovector<VersionEdit*> edit_list;
+        edit_list.push_back(ingestion_jobs[i].edit());
+        edit_lists.push_back(edit_list);
+        ++num_entries;
+      }
+      // Mark the version edits as an atomic group if the number of version
+      // edits exceeds 1.
+      if (cfds_to_commit.size() > 1) {
+        for (auto& edits : edit_lists) {
+          assert(edits.size() == 1);
+          edits[0]->MarkAtomicGroup(--num_entries);
+        }
+        assert(0 == num_entries);
+      }
+      status =
+          versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+                                 edit_lists, &mutex_, directories_.GetDbDir());
+      // It is safe to update VersionSet last seqno here after LogAndApply
+      // since LogAndApply persists the last sequence number from the
+      // VersionEdits, which come from the files' largest seqnos and not from
+      // the VersionSet.
+      //
+      // It is necessary to update last seqno here since LogAndApply releases
+      // the mutex when persisting the MANIFEST file, and the snapshots taken
+      // during that period will not be stable if the VersionSet last seqno is
+      // updated before LogAndApply.
+      int consumed_seqno_count =
+          ingestion_jobs[0].ConsumedSequenceNumbersCount();
+      for (size_t i = 1; i != num_cfs; ++i) {
+        consumed_seqno_count =
+            std::max(consumed_seqno_count,
+                     ingestion_jobs[i].ConsumedSequenceNumbersCount());
+      }
+      if (consumed_seqno_count > 0) {
+        const SequenceNumber last_seqno = versions_->LastSequence();
+        versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+      }
+    }
+
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (!cfd->IsDropped()) {
+          InstallSuperVersionAndScheduleWork(
+              cfd, &sv_ctxs[i], *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+          if (0 == i && num_cfs > 1) {
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+          }
+#endif  // !NDEBUG
+        }
+      }
+    } else if (versions_->io_status().IsIOError()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // the CURRENT file. With the current code, it's just difficult to tell.
+      // So just be pessimistic and try writing to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      const IOStatus& io_s = versions_->io_status();
+      // Should handle return error?
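+      // Setting the background error below typically makes subsequent writes
+      // fail with that error until recovery succeeds. A hedged sketch of how
+      // a caller might react (illustrative only):
+      //
+      //   Status s = db->IngestExternalFiles(args);
+      //   if (s.IsIOError()) {
+      //     // After fixing the underlying storage problem:
+      //     if (db->Resume().ok()) { /* safe to retry the ingestion */ }
+      //   }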
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+    }
+
+    // Resume writes to the DB
+    if (two_write_queues_) {
+      nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+    }
+    write_thread_.ExitUnbatched(&w);
+
+    if (status.ok()) {
+      for (auto& job : ingestion_jobs) {
+        job.UpdateStats();
+      }
+    }
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    num_running_ingest_file_ -= static_cast<int>(num_cfs);
+    if (0 == num_running_ingest_file_) {
+      bg_cv_.SignalAll();
+    }
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+  }
+  // mutex_ is unlocked here
+
+  // Cleanup
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs[i].Clean();
+    // This may rollback jobs that have completed successfully. This is
+    // intended for atomicity.
+    ingestion_jobs[i].Cleanup(status);
+  }
+  if (status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (!cfd->IsDropped()) {
+        NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+      }
+    }
+  }
+  return status;
+}
+
+Status DBImpl::CreateColumnFamilyWithImport(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    const ImportColumnFamilyOptions& import_options,
+    const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  assert(*handle == nullptr);
+  std::string cf_comparator_name = options.comparator->Name();
+  if (cf_comparator_name != metadata.db_comparator_name) {
+    return Status::InvalidArgument("Comparator name mismatch");
+  }
+
+  // Create column family.
+  auto status = CreateColumnFamily(options, column_family_name, handle);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Import sst files from metadata.
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(*handle);
+  auto cfd = cfh->cfd();
+  ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_,
+                                   file_options_, import_options,
+                                   metadata.files, io_tracer_);
+
+  SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+  VersionEdit dummy_edit;
+  uint64_t next_file_number = 0;
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  {
+    // Lock db mutex
+    InstrumentedMutexLock l(&mutex_);
+    if (error_handler_.IsDBStopped()) {
+      // Don't import files when there is a bg_error
+      status = error_handler_.GetBGError();
+    }
+
+    // Make sure that bg cleanup won't delete the files that we are importing
+    pending_output_elem.reset(new std::list<uint64_t>::iterator(
+        CaptureCurrentFileNumberInPendingOutputs()));
+
+    if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse a file number that has already been assigned to
+      // the internal file, and this would overwrite the external file. To
+      // protect the external file, we have to make sure the file number will
+      // never be reused.
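+      // Numeric illustration of the reservation below (assumed values): if
+      // the current next file number is 42 and metadata.files.size() is 3,
+      // FetchAddFileNumber(3) returns 42 and bumps the counter to 45, so the
+      // imported files become 000042.sst..000044.sst. Persisting the dummy
+      // VersionEdit forces the bumped counter into the MANIFEST, so a
+      // post-crash Recover() cannot hand out 42..44 again.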
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + } + dummy_sv_ctx.Clean(); + + if (status.ok()) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + status = import_job.Prepare(next_file_number, sv); + CleanupSuperVersion(sv); + } + + if (status.ok()) { + SuperVersionContext sv_context(true /*create_superversion*/); + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + num_running_ingest_file_++; + assert(!cfd->IsDropped()); + status = import_job.Run(); + + // Install job edit [Mutex will be unlocked here] + if (status.ok()) { + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + num_running_ingest_file_--; + if (num_running_ingest_file_ == 0) { + bg_cv_.SignalAll(); + } + } + // mutex_ is unlocked here + + sv_context.Clean(); + } + + { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + } + + import_job.Cleanup(status); + if (!status.ok()) { + Status temp_s = DropColumnFamily(*handle); + if (!temp_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DropColumnFamily failed with error %s", + temp_s.ToString().c_str()); + } + // Always returns Status::OK() + temp_s = DestroyColumnFamilyHandle(*handle); + assert(temp_s.ok()); + *handle = nullptr; + } + return status; +} + +Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); +} + +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false); +} + +Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum) { + // `bytes_read` stat is enabled based on compile-time support and cannot + // be dynamically toggled. So we do not need to worry about `PerfLevel` + // here, unlike many other `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + + Status s; + + if (use_file_checksum) { + FileChecksumGenFactory* const file_checksum_gen_factory = + immutable_db_options_.file_checksum_gen_factory.get(); + if (!file_checksum_gen_factory) { + s = Status::InvalidArgument( + "Cannot verify file checksum if options.file_checksum_gen_factory is " + "null"); + return s; + } + } + + // TODO: simplify using GetRefedColumnFamilySet? 
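+  // Two verification modes converge here. A hedged usage sketch (the CRC32c
+  // factory getter is the stock one shipped with RocksDB; `db` is
+  // illustrative):
+  //
+  //   // Block-level verification; works without any checksum factory:
+  //   Status s1 = db->VerifyChecksum(ReadOptions());
+  //
+  //   // Whole-file verification; requires the factory at Open() time:
+  //   //   options.file_checksum_gen_factory =
+  //   //       GetFileChecksumGenCrc32cFactory();
+  //   Status s2 = db->VerifyFileChecksums(ReadOptions());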
+  std::vector<ColumnFamilyData*> cfd_list;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped() && cfd->initialized()) {
+        cfd->Ref();
+        cfd_list.push_back(cfd);
+      }
+    }
+  }
+  std::vector<SuperVersion*> sv_list;
+  for (auto cfd : cfd_list) {
+    sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+  }
+
+  for (auto& sv : sv_list) {
+    VersionStorageInfo* vstorage = sv->current->storage_info();
+    ColumnFamilyData* cfd = sv->current->cfd();
+    Options opts;
+    if (!use_file_checksum) {
+      InstrumentedMutexLock l(&mutex_);
+      opts = Options(BuildDBOptions(immutable_db_options_,
+                                    mutable_db_options_),
+                     cfd->GetLatestCFOptions());
+    }
+    for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+      for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+           j++) {
+        const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j];
+        const auto& fd = fd_with_krange.fd;
+        const FileMetaData* fmeta = fd_with_krange.file_metadata;
+        assert(fmeta);
+        std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+                                          fd.GetNumber(), fd.GetPathId());
+        if (use_file_checksum) {
+          s = VerifyFullFileChecksum(fmeta->file_checksum,
+                                     fmeta->file_checksum_func_name, fname,
+                                     read_options);
+        } else {
+          s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(
+              opts, file_options_, read_options, fname, fd.largest_seqno);
+        }
+        RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+                   IOSTATS(bytes_read) - prev_bytes_read);
+        prev_bytes_read = IOSTATS(bytes_read);
+      }
+    }
+
+    if (s.ok() && use_file_checksum) {
+      const auto& blob_files = vstorage->GetBlobFiles();
+      for (const auto& meta : blob_files) {
+        assert(meta);
+
+        const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+        const std::string blob_file_name = BlobFileName(
+            cfd->ioptions()->cf_paths.front().path, blob_file_number);
+        s = VerifyFullFileChecksum(meta->GetChecksumValue(),
+                                   meta->GetChecksumMethod(), blob_file_name,
+                                   read_options);
+        RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+                   IOSTATS(bytes_read) - prev_bytes_read);
+        prev_bytes_read = IOSTATS(bytes_read);
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto sv : sv_list) {
+      if (sv && sv->Unref()) {
+        sv->Cleanup();
+        if (defer_purge) {
+          AddSuperVersionsToFreeQueue(sv);
+        } else {
+          delete sv;
+        }
+      }
+    }
+    if (defer_purge) {
+      SchedulePurge();
+    }
+    for (auto cfd : cfd_list) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+             IOSTATS(bytes_read) - prev_bytes_read);
+  return s;
+}
+
+Status DBImpl::VerifyFullFileChecksum(
+    const std::string& file_checksum_expected,
+    const std::string& func_name_expected, const std::string& fname,
+    const ReadOptions& read_options) {
+  Status s;
+  if (file_checksum_expected == kUnknownFileChecksum) {
+    return s;
+  }
+  std::string file_checksum;
+  std::string func_name;
+  s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
+      fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
+      func_name_expected, &file_checksum, &func_name,
+      read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
+      io_tracer_, immutable_db_options_.rate_limiter.get(),
+      read_options.rate_limiter_priority);
+  if (s.ok()) {
+    assert(func_name_expected == func_name);
+    if (file_checksum != file_checksum_expected) {
+      std::ostringstream oss;
+      oss << fname << " file checksum mismatch, ";
+      oss << "expecting "
+          << Slice(file_checksum_expected).ToString(/*hex=*/true);
+      oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true);
+      s = Status::Corruption(oss.str());
+      TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s);
+    }
+  }
+  return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+    ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+
+  for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+    ExternalFileIngestionInfo info;
+    info.cf_name = cfd->GetName();
+    info.external_file_path = f.external_file_path;
+    info.internal_file_path = f.internal_file_path;
+    info.global_seqno = f.assigned_seqno;
+    info.table_properties = f.table_properties;
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnExternalFileIngested(this, info);
+    }
+  }
+}
+
+void DBImpl::WaitForIngestFile() {
+  mutex_.AssertHeld();
+  while (num_running_ingest_file_ > 0) {
+    bg_cv_.Wait();
+  }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+                          std::unique_ptr<TraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options,
+                           std::move(trace_writer)));
+  return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  Status s;
+  if (tracer_ != nullptr) {
+    s = tracer_->Close();
+    tracer_.reset();
+  } else {
+    s = Status::IOError("No trace file to close");
+  }
+  return s;
+}
+
+Status DBImpl::NewDefaultReplayer(
+    const std::vector<ColumnFamilyHandle*>& handles,
+    std::unique_ptr<TraceReader>&& reader,
+    std::unique_ptr<Replayer>* replayer) {
+  replayer->reset(new ReplayerImpl(this, handles, std::move(reader)));
+  return Status::OK();
+}
+
+Status DBImpl::StartBlockCacheTrace(
+    const TraceOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer) {
+  BlockCacheTraceOptions block_trace_opts;
+  block_trace_opts.sampling_frequency = trace_options.sampling_frequency;
+
+  BlockCacheTraceWriterOptions trace_writer_opt;
+  trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size;
+
+  std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+      NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                               std::move(trace_writer));
+
+  return block_cache_tracer_.StartTrace(block_trace_opts,
+                                        std::move(block_cache_trace_writer));
+}
+
+Status DBImpl::StartBlockCacheTrace(
+    const BlockCacheTraceOptions& trace_options,
+    std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
+  return block_cache_tracer_.StartTrace(trace_options,
+                                        std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+  block_cache_tracer_.EndTrace();
+  return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+                                 const Slice& lower_bound,
+                                 const Slice upper_bound) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
+                                        const Slice& key,
+                                        const Slice& lower_bound,
+                                        const Slice upper_bound) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+    ColumnFamilyData* cfd, uint64_t num,
+    std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+    uint64_t* next_file_number) {
+  Status s;
+  SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+  assert(nullptr != next_file_number);
+  InstrumentedMutexLock l(&mutex_);
+  if (error_handler_.IsDBStopped()) {
+    // Do not ingest files when there is a bg_error
+    return error_handler_.GetBGError();
+  }
+  pending_output_elem.reset(new std::list<uint64_t>::iterator(
+      CaptureCurrentFileNumberInPendingOutputs()));
+  *next_file_number =
+      versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+  auto cf_options = cfd->GetLatestMutableCFOptions();
+  VersionEdit dummy_edit;
+  // If a crash happens after a hard link is established, the Recover function
+  // may reuse a file number that has already been assigned to the internal
+  // file, and this would overwrite the external file. To protect the external
+  // file, we have to make sure the file number will never be reused.
+  s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                             directories_.GetDbDir());
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+  }
+  dummy_sv_ctx.Clean();
+  return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  if (mutable_db_options_.max_open_files == -1) {
+    uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped()) {
+        uint64_t ctime;
+        {
+          SuperVersion* sv = GetAndRefSuperVersion(cfd);
+          Version* version = sv->current;
+          version->GetCreationTimeOfOldestFile(&ctime);
+          ReturnAndCleanupSuperVersion(cfd, sv);
+        }
+
+        if (ctime < oldest_time) {
+          oldest_time = ctime;
+        }
+        if (oldest_time == 0) {
+          break;
+        }
+      }
+    }
+    *creation_time = oldest_time;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("This API only works if max_open_files = -1");
+  }
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  // Get the time first and then the sequence number, so that the actual time
+  // of the seqno is <= the recorded unix_time.
+  int64_t unix_time = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time)
+      .PermitUncheckedError();  // Ignore error
+  SequenceNumber seqno = GetLatestSequenceNumber();
+  bool appended = false;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    appended = seqno_time_mapping_.Append(seqno, unix_time);
+  }
+  if (!appended) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to insert sequence number to time entry: %" PRIu64
+                   " -> %" PRIu64,
+                   seqno, unix_time);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..725e77c18
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2804 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_job.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/trace_reader_writer.h"
+#endif  // ROCKSDB_LITE
+#include "rocksdb/transaction_log.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/replayer.h"
+#endif  // ROCKSDB_LITE
+#include "rocksdb/write_buffer_manager.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than main one.
+class Directories {
+ public:
+  IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
+                          const std::string& wal_dir,
+                          const std::vector<std::string>& data_paths);
+
+  FSDirectory* GetDataDir(size_t path_id) const {
+    assert(path_id < data_dirs_.size());
+    FSDirectory* ret_dir = data_dirs_[path_id].get();
+    if (ret_dir == nullptr) {
+      // Should use db_dir_
+      return db_dir_.get();
+    }
+    return ret_dir;
+  }
+
+  FSDirectory* GetWalDir() {
+    if (wal_dir_) {
+      return wal_dir_.get();
+    }
+    return db_dir_.get();
+  }
+
+  FSDirectory* GetDbDir() { return db_dir_.get(); }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
+    // close all directories for all database paths
+    IOStatus s = IOStatus::OK();
+
+    // The default implementation of Close() in the Directory/FSDirectory
+    // classes returns a "NotSupported" status; the upper-level interface
+    // should be able to handle this error so that Close() does not fail
+    // after upgrading when run on FileSystems that have not implemented
+    // `Directory::Close()` or `FSDirectory::Close()` yet
+
+    if (db_dir_) {
+      IOStatus temp_s = db_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    // Attempt to close everything even if one fails
+    s.PermitUncheckedError();
+
+    if (wal_dir_) {
+      IOStatus temp_s = wal_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    s.PermitUncheckedError();
+
+    for (auto& data_dir_ptr : data_dirs_) {
+      if (data_dir_ptr) {
+        IOStatus temp_s = data_dir_ptr->Close(options, dbg);
+        if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+          s = std::move(temp_s);
+        }
+      }
+    }
+
+    // Ready for caller
+    s.MustCheck();
+    return s;
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> db_dir_;
+  std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
+  std::unique_ptr<FSDirectory> wal_dir_;
+};
+
+// DB is the public interface of RocksDB, and DBImpl is the actual class
+// implementing it. DBImpl is the entry point of the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc., wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of the functions are
+// divided among several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+  DBImpl(const DBOptions& options, const std::string& dbname,
+         const bool seq_per_batch = false, const bool batch_per_txn = true,
+         bool read_only = false);
+  // No copying allowed
+  DBImpl(const DBImpl&) = delete;
+  void operator=(const DBImpl&) = delete;
+
+  virtual ~DBImpl();
+
+  // ---- Implementations of the DB interface ----
+
+  using DB::Resume;
+  Status Resume() override;
+
+  using DB::Put;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& value) override;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& ts, const Slice& value) override;
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override;
+
+  using DB::Merge;
+  Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) override;
+  Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& ts, const Slice& value) override;
+
+  using DB::Delete;
+  Status Delete(const WriteOptions& options,
+                ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const WriteOptions& options,
+                ColumnFamilyHandle* column_family, const Slice& key,
+                const Slice& ts) override;
+
+  using DB::SingleDelete;
+  Status SingleDelete(const WriteOptions& options,
+                      ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const WriteOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override;
+
+  using DB::DeleteRange;
+  Status DeleteRange(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family,
+                     const Slice& begin_key, const Slice& end_key) override;
+  Status DeleteRange(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family,
+                     const Slice& begin_key, const Slice& end_key,
+                     const Slice& ts) override;
+
+  using DB::Write;
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override;
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value, std::string* timestamp) override;
+
+  using DB::GetEntity;
+  Status GetEntity(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableWideColumns* columns) override;
+
+  using DB::GetMergeOperands;
+  Status GetMergeOperands(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          PinnableSlice* merge_operands,
+                          GetMergeOperandsOptions* get_merge_operands_options,
+                          int* number_of_operands) override {
+    GetImplOptions get_impl_options;
+    get_impl_options.column_family = column_family;
+    get_impl_options.merge_operands = merge_operands;
+    get_impl_options.get_merge_operands_options = get_merge_operands_options;
+    get_impl_options.number_of_operands = number_of_operands;
+    get_impl_options.get_value = false;
+    return GetImpl(options, key, get_impl_options);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values,
+      std::vector<std::string>* timestamps) override;
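+  // A hedged usage sketch of the batched MultiGet overloads documented and
+  // declared below (2-key batch; the key names are illustrative):
+  //
+  //   std::array<Slice, 2> keys{{Slice("k1"), Slice("k2")}};
+  //   std::array<PinnableSlice, 2> values;
+  //   std::array<Status, 2> statuses;
+  //   db->MultiGet(ReadOptions(), cf, keys.size(), keys.data(),
+  //                values.data(), statuses.data(), /*sorted_input=*/false);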
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with number of elements
+  // equal to keys.size(). This allows the storage for those to be allocated
+  // by the caller on the stack for small batches
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families,
+                        const Slice* keys, PinnableSlice* values,
+                        Status* statuses,
+                        const bool sorted_input = false) override;
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families,
+                        const Slice* keys, PinnableSlice* values,
+                        std::string* timestamps, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGetWithCallback(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      ReadCallback* callback,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle) override;
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& cf_options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found
+  // in memory. On return, if value was found, then value_found will be set to
+  // true, otherwise false.
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family,
+                           const Slice& key, std::string* value,
+                           std::string* timestamp,
+                           bool* value_found = nullptr) override;
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+  // Create a timestamped snapshot. This snapshot can be shared by multiple
+  // readers. If any of them uses it for write conflict checking, then
+  // is_write_conflict_boundary is true. For simplicity, set it to true by
+  // default.
+  std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+      SequenceNumber snapshot_seq, uint64_t ts);
+  std::shared_ptr<const Snapshot> GetTimestampedSnapshot(uint64_t ts) const;
+  void ReleaseTimestampedSnapshotsOlderThan(
+      uint64_t ts, size_t* remaining_total_ss = nullptr);
+  Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
+                                 std::vector<std::shared_ptr<const Snapshot>>&
+                                     timestamped_snapshots) const;
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property,
+                           std::string* value) override;
+  using DB::GetMapProperty;
+  virtual bool GetMapProperty(
+      ColumnFamilyHandle* column_family, const Slice& property,
+      std::map<std::string, std::string>* value) override;
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property,
+                              uint64_t* value) override;
+  using DB::GetAggregatedIntProperty;
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* aggregated_value) override;
+  using DB::GetApproximateSizes;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) override;
+  using DB::GetApproximateMemTableStats;
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) override;
+  using DB::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override;
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override;
+
+  virtual Status PauseBackgroundWork() override;
+  virtual Status ContinueBackgroundWork() override;
+
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+  virtual void EnableManualCompaction() override;
+  virtual void DisableManualCompaction() override;
+
+  using DB::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map)
+      override;
+
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& options_map)
+      override;
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(
+      ColumnFamilyHandle* column_family) override;
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override;
+  virtual const std::string& GetName() const override;
+  virtual Env* GetEnv() const override;
+  virtual FileSystem* GetFileSystem() const override;
+  using DB::GetOptions;
+  virtual Options GetOptions(
+      ColumnFamilyHandle* column_family) const override;
+  using DB::GetDBOptions;
+  virtual DBOptions GetDBOptions() const override;
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override;
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+  virtual Status FlushWAL(bool sync) override;
+  bool WALBufferIsEmpty(bool lock = true);
+  virtual Status SyncWAL() override;
+  virtual Status LockWAL() override;
+  virtual Status UnlockWAL() override;
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override;
+
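+  // A hedged sketch of the dynamic-option APIs declared above (the option
+  // names are real RocksDB options; the values are illustrative):
+  //
+  //   db->SetOptions(cf, {{"write_buffer_size", "131072"},
+  //                       {"level0_file_num_compaction_trigger", "8"}});
+  //   db->SetDBOptions({{"max_background_jobs", "4"}});
+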
+  // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
+  // and release db_mutex
+  Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                  std::string ts_low) override;
+
+  // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
+  // release db_mutex
+  Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                             std::string* ts_low) override;
+
+  virtual Status GetDbIdentity(std::string& identity) const override;
+
+  virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+  virtual Status GetDbSessionId(std::string& session_id) const override;
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+  ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+  virtual Status Close() override;
+
+  virtual Status DisableFileDeletions() override;
+
+  virtual Status EnableFileDeletions(bool force) override;
+
+  virtual bool IsFileDeletionsEnabled() const;
+
+  Status GetStatsHistory(
+      uint64_t start_time, uint64_t end_time,
+      std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+  using DB::ResetStats;
+  virtual Status ResetStats() override;
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override;
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) override;
+  virtual Status GetCreationTimeOfOldestFile(
+      uint64_t* creation_time) override;
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number,
+      std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) override;
+  virtual Status DeleteFile(std::string name) override;
+  Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+                             const RangePtr* ranges, size_t n,
+                             bool include_end = true);
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override;
+
+  virtual Status GetLiveFilesChecksumInfo(
+      FileChecksumList* checksum_list) override;
+
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) override;
+
+  // Obtains the meta data of the specified column family of the DB.
+  // TODO(yhchiang): output parameter is placed in the end in this codebase.
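+  // The file-deletion and live-file APIs above are the classic building
+  // blocks for an external backup. A hedged sketch (error handling elided):
+  //
+  //   db->DisableFileDeletions();            // pin the current LSM files
+  //   std::vector<std::string> files;
+  //   uint64_t manifest_size = 0;
+  //   db->GetLiveFiles(files, &manifest_size, /*flush_memtable=*/true);
+  //   // ... copy `files` (read the MANIFEST up to manifest_size bytes)
+  //   db->EnableFileDeletions(/*force=*/false);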
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override;
+
+  void GetAllColumnFamilyMetaData(
+      std::vector<ColumnFamilyMetaData>* metadata) override;
+
+  Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                             const Slice* begin, const Slice* end) override;
+
+  Status PromoteL0(ColumnFamilyHandle* column_family,
+                   int target_level) override;
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& ingestion_options) override;
+
+  using DB::IngestExternalFiles;
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) override;
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options,
+      const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override;
+
+  using DB::VerifyFileChecksums;
+  Status VerifyFileChecksums(const ReadOptions& read_options) override;
+
+  using DB::VerifyChecksum;
+  virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+  // Verify the checksums of files in db. Currently only tables are checked.
+  //
+  // read_options: controls file I/O behavior, e.g. read ahead size while
+  //               reading all the live table files.
+  //
+  // use_file_checksum: if false, verify the block checksums of all live
+  //                    table files in db. Otherwise, obtain the file
+  //                    checksums and compare with the MANIFEST. Currently,
+  //                    file checksums are recomputed by reading all table
+  //                    files.
+  //
+  // Returns: OK if there is no file whose file or block checksum mismatches.
+  Status VerifyChecksumInternal(const ReadOptions& read_options,
+                                bool use_file_checksum);
+
+  Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
+                                const std::string& func_name_expected,
+                                const std::string& fpath,
+                                const ReadOptions& read_options);
+
+  using DB::StartTrace;
+  virtual Status StartTrace(
+      const TraceOptions& options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndTrace;
+  virtual Status EndTrace() override;
+
+  using DB::NewDefaultReplayer;
+  virtual Status NewDefaultReplayer(
+      const std::vector<ColumnFamilyHandle*>& handles,
+      std::unique_ptr<TraceReader>&& reader,
+      std::unique_ptr<Replayer>* replayer) override;
+
+  using DB::StartBlockCacheTrace;
+  Status StartBlockCacheTrace(
+      const TraceOptions& trace_options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& options,
+      std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
+
+  using DB::EndBlockCacheTrace;
+  Status EndBlockCacheTrace() override;
+
+  using DB::StartIOTrace;
+  Status StartIOTrace(const TraceOptions& options,
+                      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndIOTrace;
+  Status EndIOTrace() override;
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override;
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) override;
+
+#endif  // ROCKSDB_LITE
+
+  // ---- End of implementations of the DB interface ----
+  SystemClock* GetSystemClock() const;
+
+  struct GetImplOptions {
+    ColumnFamilyHandle* column_family = nullptr;
+    PinnableSlice* value = nullptr;
+    PinnableWideColumns* columns = nullptr;
+    std::string* timestamp = nullptr;
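+    // Descriptive note: each pointer in this struct is an optional
+    // out-parameter. Callers leave a field nullptr to indicate that the
+    // corresponding piece of information is not requested, which lets Get,
+    // KeyMayExist and GetMergeOperands all funnel through the same GetImpl()
+    // with different subsets populated.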
+    bool* value_found = nullptr;
+    ReadCallback* callback = nullptr;
+    bool* is_blob_index = nullptr;
+    // If true return value associated with key via value pointer else return
+    // all merge operands for key via merge_operands pointer
+    bool get_value = true;
+    // Pointer to an array of size
+    // get_merge_operands_options.expected_max_number_of_operands allocated by
+    // user
+    PinnableSlice* merge_operands = nullptr;
+    GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+    int* number_of_operands = nullptr;
+  };
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  // This function is also called by GetMergeOperands
+  // If get_impl_options.get_value = true, get the value associated with
+  // get_impl_options.key via get_impl_options.value
+  // If get_impl_options.get_value = false, get the merge operands associated
+  // with get_impl_options.key via get_impl_options.merge_operands
+  Status GetImpl(const ReadOptions& options, const Slice& key,
+                 GetImplOptions& get_impl_options);
+
+  // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool expose_blob_index = false,
+                                      bool allow_refresh = true);
+
+  virtual SequenceNumber GetLastPublishedSequence() const {
+    if (last_seq_same_as_publish_seq_) {
+      return versions_->LastSequence();
+    } else {
+      return versions_->LastPublishedSequence();
+    }
+  }
+
+  // REQUIRES: joined the main write queue if two_write_queues is disabled,
+  // and the second write queue otherwise.
+  virtual void SetLastPublishedSequence(SequenceNumber seq);
+  // Returns LastSequence in last_seq_same_as_publish_seq_
+  // mode and LastAllocatedSequence otherwise. This is useful when visibility
+  // depends also on data written to the WAL but not to the memtable.
+  SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+  // Similar to Write() but will call the callback once on the single write
+  // thread to determine whether it is safe to perform the write.
+  virtual Status WriteWithCallback(const WriteOptions& write_options,
+                                   WriteBatch* my_batch,
+                                   WriteCallback* callback);
+
+  // Returns the sequence number that is guaranteed to be smaller than or
+  // equal to the sequence number of any key that could be inserted into the
+  // current memtables. It can then be assumed that any write with a larger
+  // (or equal) sequence number will be present in this memtable or a later
+  // memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  //
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                   bool include_history);
+
+  // For a given key, check to see if there are any records for this key
+  // in the memtables, including memtable history. If cache_only is false,
+  // SST files will also be checked.
+  //
+  // `key` should NOT have user-defined timestamp appended to user key even if
+  // timestamp is enabled.
+  //
+  // If a key is found, *found_record_for_key will be set to true and
+  // *seq will be set to the stored sequence number for the latest
+  // operation on this key or kMaxSequenceNumber if unknown. If user-defined
+  // timestamp is enabled for this column family and timestamp is not
+  // nullptr, then *timestamp will be set to the stored timestamp for the
+  // latest operation on this key.
+  // If no key is found, *found_record_for_key will be set to false.
+  //
+  // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+  // the sequence number has been cleared from the record. If the caller is
+  // holding an active db snapshot, we know the missing sequence must be less
+  // than the snapshot's sequence number (sequence numbers are only cleared
+  // when there are no earlier active snapshots).
+  //
+  // If NotFound is returned and found_record_for_key is set to false, then
+  // no record for this key was found. If the caller is holding an active db
+  // snapshot, we know that no key could have existed after this snapshot
+  // (since we do not compact keys that have an earlier snapshot).
+  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain
+  // data older than `lower_bound_seq`.
+  //
+  // Returns OK or NotFound on success,
+  // other status on unexpected error.
+  // TODO(andrewkr): this API needs to be aware of range deletion operations
+  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+                                 bool cache_only,
+                                 SequenceNumber lower_bound_seq,
+                                 SequenceNumber* seq, std::string* timestamp,
+                                 bool* found_record_for_key,
+                                 bool* is_blob_index);
+
+  Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+                           const Slice& lower_bound,
+                           const Slice upper_bound);
+  Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+                                  const Slice& lower_bound,
+                                  const Slice upper_bound);
+#endif  // ROCKSDB_LITE
+
+  // Similar to GetSnapshot(), but also lets the db know that this snapshot
+  // will be used for transaction write-conflict checking. The DB can then
+  // make sure not to compact any keys that would prevent a write-conflict
+  // from being detected.
+  const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+  // checks if all live files exist on file system and that their file sizes
+  // match our in-memory records
+  virtual Status CheckConsistency();
+
+  // max_file_num_to_ignore allows bottom level compaction to filter out
+  // newly compacted SST files. Setting max_file_num_to_ignore to kMaxUint64
+  // will disable the filtering
+  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                             int output_level,
+                             const CompactRangeOptions& compact_range_options,
+                             const Slice* begin, const Slice* end,
+                             bool exclusive, bool disallow_trivial_move,
+                             uint64_t max_file_num_to_ignore,
+                             const std::string& trim_ts);
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  // If allow_unprepared_value is true, the returned iterator may defer
+  // reading the value and so will require PrepareValue() to be called before
+  // value(); allow_unprepared_value = false is convenient when this
+  // optimization is not useful, e.g. when reading the whole column family.
+  //
+  // read_options.ignore_range_deletions determines whether range tombstones
+  // are processed in the returned iterator internally, i.e., whether range
+  // tombstone covered keys are in this iterator's output.
+  // @param read_options Must outlive the returned iterator.
+  InternalIterator* NewInternalIterator(
+      const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
+      ColumnFamilyHandle* column_family = nullptr,
+      bool allow_unprepared_value = false);
+
+  // Note: to support DB iterator refresh, memtable range tombstones in the
+  // underlying merging iterator need to be refreshed. If db_iter is not
+  // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
+  // memtable range tombstone iterator used by the underlying merging
+  // iterator. This range tombstone iterator can be refreshed later by
+  // db_iter.
+  // @param read_options Must outlive the returned iterator.
+  InternalIterator* NewInternalIterator(const ReadOptions& read_options,
+                                        ColumnFamilyData* cfd,
+                                        SuperVersion* super_version,
+                                        Arena* arena, SequenceNumber sequence,
+                                        bool allow_unprepared_value,
+                                        ArenaWrappedDBIter* db_iter = nullptr);
+
+  LogsWithPrepTracker* logs_with_prep_tracker() {
+    return &logs_with_prep_tracker_;
+  }
+
+  struct BGJobLimits {
+    int max_flushes;
+    int max_compactions;
+  };
+  // Returns maximum background flushes and compactions allowed to be
+  // scheduled
+  BGJobLimits GetBGJobLimits() const;
+  // Need a static version that can be called during SanitizeOptions().
+  // (A sketch of the resolution logic appears at the end of this block.)
+  static BGJobLimits GetBGJobLimits(int max_background_flushes,
+                                    int max_background_compactions,
+                                    int max_background_jobs,
+                                    bool parallelize_compactions);
+
+  // Move logs pending closing from job_context to the DB queue and
+  // schedule a purge
+  void ScheduleBgLogWriterClose(JobContext* job_context);
+
+  uint64_t MinLogNumberToKeep();
+
+  // Returns the lower bound file number for SSTs that won't be deleted,
+  // even if they're obsolete. This lower bound is used internally to prevent
+  // newly created flush/compaction output files from being deleted before
+  // they're installed. This technique avoids the need for tracking the exact
+  // numbers of files pending creation, although it prevents more files than
+  // necessary from being deleted.
+  uint64_t MinObsoleteSstNumberToKeep();
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'candidate_files'.
+  // If force == false and the last call was less than
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the job_context
+  void FindObsoleteFiles(JobContext* job_context, bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames; those that do not
+  // belong to live files are possibly removed. Also removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  // If FindObsoleteFiles() was run, we need to also run
+  // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  void PurgeObsoleteFiles(JobContext& background_context,
+                          bool schedule_only = false);
+
+  // Schedule a background job to actually delete obsolete files.
+  void SchedulePurge();
+
+  const SnapshotList& snapshots() const { return snapshots_; }
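+  // Sketch of how the static GetBGJobLimits() above can resolve the three
+  // knobs (illustrative paraphrase, not a verbatim copy of the definition;
+  // the -1 "unset" sentinels and the one-quarter split are assumptions):
+  //
+  //   BGJobLimits res;
+  //   if (max_background_flushes == -1 && max_background_compactions == -1) {
+  //     // Derive both limits from max_background_jobs, giving roughly a
+  //     // quarter of the threads to flushes.
+  //     res.max_flushes = std::max(1, max_background_jobs / 4);
+  //     res.max_compactions =
+  //         std::max(1, max_background_jobs - res.max_flushes);
+  //   } else {
+  //     // Legacy knobs take effect when explicitly set.
+  //     res.max_flushes = std::max(1, max_background_flushes);
+  //     res.max_compactions = std::max(1, max_background_compactions);
+  //   }
+  //   if (!parallelize_compactions) {
+  //     res.max_compactions = 1;  // throttle until parallelism is wanted
+  //   }
+  //   return res;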
+  // Load the list of snapshots that are no newer than `max_seq` into
+  // `snap_vector`, in ascending order.
+  // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+  // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+  void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+                     SequenceNumber* oldest_write_conflict_snapshot,
+                     const SequenceNumber& max_seq) const {
+    InstrumentedMutexLock l(mutex());
+    snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+  }
+
+  const ImmutableDBOptions& immutable_db_options() const {
+    return immutable_db_options_;
+  }
+
+  // Cancel all background jobs, including flush, compaction, background
+  // purging, stats dumping threads, etc. If `wait` = true, wait for the
+  // running jobs to abort or finish before returning. Otherwise, only
+  // sends the signals.
+  void CancelAllBackgroundWork(bool wait);
+
+  // Find the SuperVersion and reference it. Based on options, it might
+  // return the thread-local cached one.
+  // Call ReturnAndCleanupSuperVersion() when it is no longer needed (see the
+  // RAII sketch at the end of this block).
+  SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+  // Similar to the previous function but looks up based on a column family
+  // id. nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread or if
+  // the mutex is held.
+  SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+  // Un-reference the super version and clean it up if it is the last
+  // reference.
+  void CleanupSuperVersion(SuperVersion* sv);
+
+  // Un-reference the super version and return it to the thread-local cache
+  // if needed. If it is the last reference of the super version, clean it up
+  // after un-referencing it.
+  void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+  // Similar to the previous function but looks up based on a column family
+  // id.
+  // REQUIRED: this function should only be called on the write thread.
+  void ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                    SuperVersion* sv);
+
+  // REQUIRED: this function should only be called on the write thread or if
+  // the mutex is held. Return value only valid until next call to this
+  // function or mutex is released.
+  ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+  // Same as above, should be called without mutex held and not on the write
+  // thread.
+  std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+      uint32_t column_family_id);
+
+  // Returns the number of currently running flushes.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_flushes() {
+    mutex_.AssertHeld();
+    return num_running_flushes_;
+  }
+
+  // Returns the number of currently running compactions.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_compactions() {
+    mutex_.AssertHeld();
+    return num_running_compactions_;
+  }
+
+  const WriteController& write_controller() { return write_controller_; }
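+  // Hypothetical RAII helper showing the intended pairing of
+  // GetAndRefSuperVersion()/ReturnAndCleanupSuperVersion() documented above;
+  // `ScopedSuperVersion` is an illustration and not part of this header:
+  //
+  //   class ScopedSuperVersion {
+  //    public:
+  //     ScopedSuperVersion(DBImpl* db, ColumnFamilyData* cfd)
+  //         : db_(db), cfd_(cfd), sv_(db->GetAndRefSuperVersion(cfd)) {}
+  //     ~ScopedSuperVersion() {
+  //       db_->ReturnAndCleanupSuperVersion(cfd_, sv_);
+  //     }
+  //     SuperVersion* get() const { return sv_; }
+  //    private:
+  //     DBImpl* db_;
+  //     ColumnFamilyData* cfd_;
+  //     SuperVersion* sv_;
+  //   };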
+  // Hollow transaction shells used for recovery.
+  // These will then be passed to TransactionDB so that
+  // locks can be reacquired before writing can resume.
+  struct RecoveredTransaction {
+    std::string name_;
+    bool unprepared_;
+
+    struct BatchInfo {
+      uint64_t log_number_;
+      // TODO(lth): For unprepared, the memory usage here can be big for
+      // unprepared transactions. This is only useful for rollbacks, and we
+      // can in theory just keep the keyset for that.
+      WriteBatch* batch_;
+      // Number of sub-batches. A new sub-batch is created if the txn
+      // attempts to insert a duplicate (key, seq) into the memtable. This is
+      // currently used in WritePreparedTxn/WriteUnpreparedTxn.
+      size_t batch_cnt_;
+    };
+
+    // This maps the seq of the first key in the batch to BatchInfo, which
+    // contains WriteBatch and other information relevant to the batch.
+    //
+    // For WriteUnprepared, batches_ can have size greater than 1, but for
+    // other write policies, it must be of size 1.
+    std::map<SequenceNumber, BatchInfo> batches_;
+
+    explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared)
+        : name_(name), unprepared_(unprepared) {
+      batches_[seq] = {log, batch, batch_cnt};
+    }
+
+    ~RecoveredTransaction() {
+      for (auto& it : batches_) {
+        delete it.second.batch_;
+      }
+    }
+
+    void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+                  size_t batch_cnt, bool unprepared) {
+      assert(batches_.count(seq) == 0);
+      batches_[seq] = {log_number, batch, batch_cnt};
+      // Prior state must be unprepared, since the prepare batch must be the
+      // last batch.
+      assert(unprepared_);
+      unprepared_ = unprepared;
+    }
+  };
+
+  bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+  std::unordered_map<std::string, RecoveredTransaction*>
+  recovered_transactions() {
+    return recovered_transactions_;
+  }
+
+  RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    if (it == recovered_transactions_.end()) {
+      return nullptr;
+    } else {
+      return it->second;
+    }
+  }
+
+  void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared_batch) {
+    // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+    // times for every unprepared batch encountered during recovery. (See
+    // the example sequence at the end of this block.)
+    //
+    // If the transaction is prepared, then the last call to
+    // InsertRecoveredTransaction will have unprepared_batch = false.
+    auto rtxn = recovered_transactions_.find(name);
+    if (rtxn == recovered_transactions_.end()) {
+      recovered_transactions_[name] = new RecoveredTransaction(
+          log, name, batch, seq, batch_cnt, unprepared_batch);
+    } else {
+      rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+    }
+    logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+  }
+
+  void DeleteRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    assert(it != recovered_transactions_.end());
+    auto* trx = it->second;
+    recovered_transactions_.erase(it);
+    for (const auto& info : trx->batches_) {
+      logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+          info.second.log_number_);
+    }
+    delete trx;
+  }
+
+  void DeleteAllRecoveredTransactions() {
+    for (auto it = recovered_transactions_.begin();
+         it != recovered_transactions_.end(); ++it) {
+      delete it->second;
+    }
+    recovered_transactions_.clear();
+  }
+
+  void AddToLogsToFreeQueue(log::Writer* log_writer) {
+    mutex_.AssertHeld();
+    logs_to_free_queue_.push_back(log_writer);
+  }
+
+  void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+    superversions_to_free_queue_.push_back(sv);
+  }
+
+  void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+  // Fill JobContext with snapshot information needed by flush and
+  // compaction.
+  void GetSnapshotContext(JobContext* job_context,
+                          std::vector<SequenceNumber>* snapshot_seqs,
+                          SequenceNumber* earliest_write_conflict_snapshot,
+                          SnapshotChecker** snapshot_checker);
+
+  // Not thread-safe.
+  void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+  InstrumentedMutex* mutex() const { return &mutex_; }
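+  // Example recovery sequence for a two-batch WriteUnprepared transaction
+  // "txn1" (illustrative; the log numbers, seqs, and batch pointers are
+  // hypothetical):
+  //
+  //   InsertRecoveredTransaction(/*log=*/7, "txn1", batch_a, /*seq=*/100,
+  //                              /*batch_cnt=*/1, /*unprepared_batch=*/true);
+  //   InsertRecoveredTransaction(/*log=*/8, "txn1", batch_b, /*seq=*/120,
+  //                              /*batch_cnt=*/2,
+  //                              /*unprepared_batch=*/false);
+  //
+  //   // Result: one RecoveredTransaction with
+  //   //   batches_ = {100: {7, batch_a, 1}, 120: {8, batch_b, 2}},
+  //   // unprepared_ == false (the prepare batch arrived last), and both
+  //   // log 7 and log 8 are marked as containing a prep section.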
+  // Initialize a brand new DB. The DB directory is expected to be empty
+  // before calling it. Push the new manifest file name into `new_filenames`.
+  Status NewDB(std::vector<std::string>* new_filenames);
+
+  // This is to be used only by internal rocksdb classes.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                     const bool seq_per_batch, const bool batch_per_txn);
+
+  static IOStatus CreateAndNewDirectory(
+      FileSystem* fs, const std::string& dirname,
+      std::unique_ptr<FSDirectory>* directory);
+
+  // Find the stats map from stats_history_ with the smallest timestamp in
+  // the range of [start_time, end_time)
+  bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+                       uint64_t* new_time,
+                       std::map<std::string, uint64_t>* stats_map);
+
+  // Print information of all tombstones of all iterators to the std::string.
+  // This is only used by ldb. The output might be capped. Tombstones
+  // printed out are not guaranteed to be in any order.
+  Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+                                     int max_entries_to_print,
+                                     std::string* out_str);
+
+  VersionSet* GetVersionSet() const { return versions_.get(); }
+
+  // Wait for any compaction
+  // We add a bool parameter to wait for unscheduledCompactions_ == 0, but
+  // this is only for the special test of CancelledCompactions
+  Status WaitForCompact(bool waitUnscheduled = false);
+
+#ifndef NDEBUG
+  // Compact any files in the named level that overlap [*begin, *end]
+  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+                           ColumnFamilyHandle* column_family = nullptr,
+                           bool disallow_trivial_move = false);
+
+  Status TEST_SwitchWAL();
+
+  bool TEST_UnableToReleaseOldestLog() {
+    return unable_to_release_oldest_log_;
+  }
+
+  bool TEST_IsLogGettingFlushed() {
+    return alive_log_files_.begin()->getting_flushed;
+  }
+
+  Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+                            ColumnFamilyHandle* cfh = nullptr);
+
+  Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+                            const FlushOptions& flush_opts);
+
+  // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+  // is because in certain cases, we can flush column families, wait for the
+  // flush to complete, but delete the column family handle before the wait
+  // finishes. For example in CompactRange.
+  Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+                                   const FlushOptions& flush_opts);
+
+  // Wait for background threads to complete scheduled work.
+  Status TEST_WaitForBackgroundWork();
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable(
+      ColumnFamilyHandle* column_family = nullptr);
+
+  // Wait for any compaction
+  // We add a bool parameter to wait for unscheduledCompactions_ == 0, but
+  // this is only for the special test of CancelledCompactions
+  Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+  // Wait for any background purge
+  Status TEST_WaitForPurge();
+
+  // Get the background error status
+  Status TEST_GetBGError();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  uint64_t TEST_MaxNextLevelOverlappingBytes(
+      ColumnFamilyHandle* column_family = nullptr);
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Returns the number that'll be assigned to the next file that's created.
+  uint64_t TEST_Current_Next_FileNo();
+
+  // Get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  void TEST_GetFilesMetaData(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::vector<FileMetaData>>* metadata,
+      std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
+
+  void TEST_LockMutex();
+
+  void TEST_UnlockMutex();
+
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();
+
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);
+
+  uint64_t TEST_MaxTotalInMemoryState() const {
+    return max_total_in_memory_state_;
+  }
+
+  size_t TEST_LogsToFreeSize();
+
+  uint64_t TEST_LogfileNumber();
+
+  uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+  // Returns column family name to ImmutableCFOptions map.
+  Status TEST_GetAllImmutableCFOptions(
+      std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+  // Return the latest MutableCFOptions of a column family
+  Status TEST_GetLatestMutableCFOptions(
+      ColumnFamilyHandle* column_family,
+      MutableCFOptions* mutable_cf_options);
+
+  Cache* TEST_table_cache() { return table_cache_.get(); }
+
+  WriteController& TEST_write_controler() { return write_controller_; }
+
+  uint64_t TEST_FindMinLogContainingOutstandingPrep();
+  uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+  size_t TEST_PreparedSectionCompletedSize();
+  size_t TEST_LogsWithPrepSize();
+
+  int TEST_BGCompactionsAllowed() const;
+  int TEST_BGFlushesAllowed() const;
+  size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
+  SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
+  size_t TEST_EstimateInMemoryStatsHistorySize() const;
+
+  uint64_t TEST_GetCurrentLogNumber() const {
+    InstrumentedMutexLock l(mutex());
+    assert(!logs_.empty());
+    return logs_.back().number;
+  }
+
+  const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
+    return files_grabbed_for_purge_;
+  }
+
+#ifndef ROCKSDB_LITE
+  const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
+#endif  // !ROCKSDB_LITE
+
+#endif  // NDEBUG
+
+  // Persist stats to column family "_persistent_stats"
+  void PersistStats();
+
+  // Dump rocksdb.stats to LOG
+  void DumpStats();
+
+  // Flush LOG out of application buffer
+  void FlushInfoLog();
+
+  // Record current sequence number to time mapping
+  void RecordSeqnoToTimeMapping();
+
+  // Interface to block and signal the DB in case of stalling writes by
+  // WriteBufferManager. Each DBImpl object contains a ptr to
+  // WBMStallInterface. When the DB needs to be blocked or signalled by
+  // WriteBufferManager, state_ is changed accordingly.
+  class WBMStallInterface : public StallInterface {
+   public:
+    enum State {
+      BLOCKED = 0,
+      RUNNING,
+    };
+
+    WBMStallInterface() : state_cv_(&state_mutex_) {
+      MutexLock lock(&state_mutex_);
+      state_ = State::RUNNING;
+    }
+
+    void SetState(State state) {
+      MutexLock lock(&state_mutex_);
+      state_ = state;
+    }
+
+    // Change the state_ to State::BLOCKED and wait until its state is
+    // changed by WriteBufferManager. When the stall is cleared, Signal() is
+    // called to change the state and unblock the DB.
+    void Block() override {
+      MutexLock lock(&state_mutex_);
+      while (state_ == State::BLOCKED) {
+        TEST_SYNC_POINT("WBMStallInterface::BlockDB");
+        state_cv_.Wait();
+      }
+    }
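+    // A standalone sketch of this Block()/Signal() protocol using plain
+    // std:: primitives (illustrative; the real class uses port::Mutex and
+    // port::CondVar, and the state transitions are driven by
+    // WriteBufferManager):
+    //
+    //   struct Stall {
+    //     std::mutex mu;
+    //     std::condition_variable cv;
+    //     bool blocked = false;
+    //     void Block() {   // DB thread parks here while blocked == true
+    //       std::unique_lock<std::mutex> l(mu);
+    //       cv.wait(l, [&] { return !blocked; });
+    //     }
+    //     void Signal() {  // WriteBufferManager clears the stall
+    //       { std::lock_guard<std::mutex> l(mu); blocked = false; }
+    //       cv.notify_one();
+    //     }
+    //   };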
+    // Called from WriteBufferManager. This function changes the state_
+    // to State::RUNNING, indicating the stall is cleared and the DB can
+    // proceed.
+    void Signal() override {
+      {
+        MutexLock lock(&state_mutex_);
+        state_ = State::RUNNING;
+      }
+      state_cv_.Signal();
+    }
+
+   private:
+    // Condition variable and mutex to block and
+    // signal the DB during the stalling process.
+    port::Mutex state_mutex_;
+    port::CondVar state_cv_;
+    // State representing whether the DB is running or blocked because of a
+    // stall by WriteBufferManager.
+    State state_;
+  };
+
+  static void TEST_ResetDbSessionIdGen();
+  static std::string GenerateDbSessionId(Env* env);
+
+  bool seq_per_batch() const { return seq_per_batch_; }
+
+ protected:
+  const std::string dbname_;
+  // TODO(peterd): unify with VersionSet::db_id_
+  std::string db_id_;
+  // db_session_id_ is an identifier that gets reset
+  // every time the DB is opened
+  std::string db_session_id_;
+  std::unique_ptr<VersionSet> versions_;
+  // Flag to check whether we allocated and own the info log file
+  bool own_info_log_;
+  Status init_logger_creation_s_;
+  const DBOptions initial_db_options_;
+  Env* const env_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  const ImmutableDBOptions immutable_db_options_;
+  FileSystemPtr fs_;
+  MutableDBOptions mutable_db_options_;
+  Statistics* stats_;
+  std::unordered_map<std::string, RecoveredTransaction*>
+      recovered_transactions_;
+  std::unique_ptr<Tracer> tracer_;
+  InstrumentedMutex trace_mutex_;
+  BlockCacheTracer block_cache_tracer_;
+
+  // Constant false canceled flag, used when the compaction is not manual
+  const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+
+  // State below is protected by mutex_
+  // With two_write_queues enabled, some of the variables accessed during
+  // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+  // logs_, logfile_number_. Refer to the definition of each variable below
+  // for more description.
+  //
+  // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
+  // cachelines.
+  mutable CacheAlignedInstrumentedMutex mutex_;
+
+  ColumnFamilyHandleImpl* default_cf_handle_;
+  InternalStats* default_cf_internal_stats_;
+
+  // table_cache_ provides its own synchronization
+  std::shared_ptr<Cache> table_cache_;
+
+  ErrorHandler error_handler_;
+
+  // Unified interface for logging events
+  EventLogger event_logger_;
+
+  // Only used for dynamically adjusting max_total_wal_size. It is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families
+  std::atomic<uint64_t> max_total_in_memory_state_;
+
+  // The options to access storage files
+  const FileOptions file_options_;
+
+  // Additional options for compaction and flush
+  FileOptions file_options_for_compaction_;
+
+  std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+  // Increase the sequence number after writing each batch, whether the
+  // memtable is disabled for that or not. Otherwise the sequence number is
+  // increased after writing each key into the memtable. This implies that
+  // when disable_memtable is set, the seq is not increased at all.
+  //
+  // Default: false
+  const bool seq_per_batch_;
+  // This determines during recovery whether we expect one writebatch per
+  // recovered transaction, or potentially multiple writebatches per
+  // transaction. For WriteUnprepared, this is set to false, since multiple
+  // batches can exist per transaction.
+  //
+  // Default: true
+  const bool batch_per_txn_;
+
+  // Each flush or compaction gets its own job id. This counter makes sure
+  // they're unique
+  std::atomic<int> next_job_id_;
+
+  std::atomic<bool> shutting_down_;
+
+  // RecoveryContext struct stores the context about version edits along
+  // with corresponding column_family_data and column_family_options.
+  class RecoveryContext {
+   public:
+    ~RecoveryContext() {
+      for (auto& edit_list : edit_lists_) {
+        for (auto* edit : edit_list) {
+          delete edit;
+        }
+      }
+    }
+
+    void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
+      assert(cfd != nullptr);
+      if (map_.find(cfd->GetID()) == map_.end()) {
+        uint32_t size = static_cast<uint32_t>(map_.size());
+        map_.emplace(cfd->GetID(), size);
+        cfds_.emplace_back(cfd);
+        mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
+        edit_lists_.emplace_back(autovector<VersionEdit*>());
+      }
+      uint32_t i = map_[cfd->GetID()];
+      edit_lists_[i].emplace_back(new VersionEdit(edit));
+    }
+
+    std::unordered_map<uint32_t, uint32_t> map_;  // cf_id to index
+    autovector<ColumnFamilyData*> cfds_;
+    autovector<const MutableCFOptions*> mutable_cf_opts_;
+    autovector<autovector<VersionEdit*>> edit_lists_;
+    // files_to_delete_ contains sst files
+    std::unordered_set<std::string> files_to_delete_;
+  };
+
+  // Persist options to the options file. Except in DB::Open(), this should
+  // only be called from the write thread.
+  // If need_mutex_lock = true, the method will lock the DB mutex; otherwise
+  // the caller must already hold it. Similarly, if need_enter_write_thread =
+  // true, the method will enter the write thread itself; otherwise the
+  // caller must have entered it already.
+  Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+  Status CompactRangeInternal(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              const std::string& trim_ts);
+
+  // The following two functions can only be called when:
+  // 1. WriteThread::Writer::EnterUnbatched() is used.
+  // 2. db_mutex is NOT held
+  Status RenameTempFileToOptionsFile(const std::string& file_name);
+  Status DeleteObsoleteOptionsFiles();
+
+  void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                          const MutableCFOptions& mutable_cf_options,
+                          int job_id);
+
+  void NotifyOnFlushCompleted(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+  void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                               const Status& st,
+                               const CompactionJobStats& job_stats,
+                               int job_id);
+
+  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+                                   const Status& st,
+                                   const CompactionJobStats& job_stats,
+                                   int job_id);
+  void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+                              const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+  void NotifyOnExternalFileIngested(
+      ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+
+  virtual Status FlushForGetLiveFiles();
+#endif  // !ROCKSDB_LITE
+
+  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusDbInfo() const;
+
+  // If disable_memtable is set, the application logic must guarantee that
+  // the batch will still be skipped from the memtable during recovery. An
+  // exception to this is seq_per_batch_ mode, in which, since each batch
+  // already takes one seq, it is ok for the batch to write to the memtable
+  // during recovery as long as it only takes one sequence number: i.e., no
+  // duplicate keys.
+  // In WriteCommitted it is guaranteed since disable_memtable is used for
+  // the prepare batch which will be written to the memtable later during the
+  // commit, and in WritePrepared it is guaranteed since it will be used only
+  // for WAL markers which will never be written to the memtable. If the
+  // commit marker is accompanied with CommitTimeWriteBatch that is not
+  // written to the memtable as long as it has no duplicate keys, it does not
+  // violate the one-seq-per-batch policy.
+  // batch_cnt is expected to be non-zero in seq_per_batch mode and
+  // indicates the number of sub-batches. A sub-batch is a subset of the
+  // write batch that does not have duplicate keys. (See the toy example at
+  // the end of this block.)
+  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+                   WriteCallback* callback = nullptr,
+                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                   bool disable_memtable = false, uint64_t* seq_used = nullptr,
+                   size_t batch_cnt = 0,
+                   PreReleaseCallback* pre_release_callback = nullptr,
+                   PostMemTableCallback* post_memtable_callback = nullptr);
+
+  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+                            WriteCallback* callback = nullptr,
+                            uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                            bool disable_memtable = false,
+                            uint64_t* seq_used = nullptr);
+
+  // Write only to memtables without joining any write queue
+  Status UnorderedWriteMemtable(const WriteOptions& write_options,
+                                WriteBatch* my_batch, WriteCallback* callback,
+                                uint64_t log_ref, SequenceNumber seq,
+                                const size_t sub_batch_cnt);
+
+  // Whether the batch requires to be assigned with an order
+  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+  // Whether it requires publishing last sequence or not
+  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+  // Join the write_thread to write the batch only to the WAL. It is the
+  // responsibility of the caller to also write the write batch to the
+  // memtable if it is required.
+  //
+  // sub_batch_cnt is expected to be non-zero when assign_order =
+  // kDoAssignOrder, indicating the number of sub-batches in my_batch. A
+  // sub-batch is a subset of the write batch that does not have duplicate
+  // keys. When seq_per_batch is not set, each key is a separate sub_batch.
+  // Otherwise each duplicate key marks the start of a new sub-batch.
+  Status WriteImplWALOnly(
+      WriteThread* write_thread, const WriteOptions& options,
+      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+      PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+      const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+  // Write cached_recoverable_state_ to the memtable if it is not empty.
+  // The writer must be the leader in write_thread_ and holding mutex_
+  Status WriteRecoverableState();
+
+  // Actual implementation of Close()
+  Status CloseImpl();
+
+  // Recover the descriptor from persistent storage. May do a significant
+  // amount of work to recover recently logged updates. Any changes to
+  // be made to the descriptor are added to *edit.
+  // recovered_seq is set to less than kMaxSequenceNumber if the log's tail
+  // is skipped.
+  // recovery_ctx stores the context about version edits and all those
+  // edits are persisted to the new Manifest after successfully syncing the
+  // new WAL.
+  virtual Status Recover(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only = false, bool error_if_wal_file_exists = false,
+      bool error_if_data_exists_in_wals = false,
+      uint64_t* recovered_seq = nullptr,
+      RecoveryContext* recovery_ctx = nullptr);
+
+  virtual bool OwnTablesAndLogs() const { return true; }
+
+  // Setup the DB identity file, and write DB ID to manifest if necessary.
+  Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
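+  // Toy illustration of sub-batch counting for the comments above
+  // (hypothetical helper; in the real code the count is produced while the
+  // batch is built). Every duplicate user key starts a new sub-batch, so
+  // the key sequence a, b, a, c, b yields 2 sub-batches: {a, b} {a, c, b}:
+  //
+  //   size_t CountSubBatches(const std::vector<std::string>& keys) {
+  //     std::set<std::string> seen;
+  //     size_t cnt = keys.empty() ? 0 : 1;
+  //     for (const auto& k : keys) {
+  //       if (!seen.insert(k).second) {  // duplicate -> new sub-batch
+  //         seen.clear();
+  //         seen.insert(k);
+  //         ++cnt;
+  //       }
+  //     }
+  //     return cnt;
+  //   }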
+  // Assign db_id_ and write DB ID to manifest if necessary.
+  void SetDBId(std::string&& id, bool read_only,
+               RecoveryContext* recovery_ctx);
+
+  // REQUIRES: db mutex held when calling this function, but the db mutex can
+  // be released and re-acquired. Db mutex will be held when the function
+  // returns.
+  // After recovery, there may be SST files in db/cf paths that are
+  // not referenced in the MANIFEST (e.g.
+  // 1. It's best effort recovery;
+  // 2. The VersionEdits referencing the SST files are appended to
+  // RecoveryContext, the DB crashes when syncing the MANIFEST, and the
+  // VersionEdits are still not synced to the MANIFEST during recovery.)
+  // It stores the SST files to be deleted in RecoveryContext. In the
+  // meantime, we find out the largest file number present in the paths, and
+  // bump up the version set's next_file_number_ to be
+  // 1 + largest_file_number.
+  // recovery_ctx stores the context about version edits and files to be
+  // deleted. All those edits are persisted to the new Manifest after
+  // successfully syncing the new WAL.
+  Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
+
+  // SetDbSessionId() should be called in the constructor DBImpl()
+  // to ensure that db_session_id_ gets updated every time the DB is opened
+  void SetDbSessionId();
+
+  Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
+  Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+                            const Slice& ts, bool ts_for_read) const;
+
+  // recovery_ctx stores the context about version edits and
+  // LogAndApplyForRecovery persists all those edits to the new Manifest
+  // after successfully syncing the new WAL.
+  // LogAndApplyForRecovery should be called only once during recovery and it
+  // should be called when RocksDB writes to a first new MANIFEST since this
+  // recovery.
+  Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
+
+  void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+  // Return true to proceed with the current WAL record whose content is
+  // stored in `batch`. Return false to skip the current WAL record.
+  bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+                                          const std::string& wal_fname,
+                                          log::Reader::Reporter& reporter,
+                                          Status& status, bool& stop_replay,
+                                          WriteBatch& batch);
+
+ private:
+  friend class DB;
+  friend class ErrorHandler;
+  friend class InternalStats;
+  friend class PessimisticTransaction;
+  friend class TransactionBaseImpl;
+  friend class WriteCommittedTxn;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDB;
+  friend class WriteBatchWithIndex;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+  friend class ForwardIterator;
+#endif
+  friend struct SuperVersion;
+  friend class CompactedDBImpl;
+  friend class DBTest_ConcurrentFlushWAL_Test;
+  friend class DBTest_MixedSlowdownOptionsStop_Test;
+  friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+  friend class DBCompactionTest_CompactionDuringShutdown_Test;
+  friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+  friend class DBTest2_ReadCallbackTest_Test;
+  friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
+  friend class XFTransactionWriteHandler;
+  friend class DBBlobIndexTest;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+  struct CompactionState;
+  struct PrepickedCompaction;
+  struct PurgeFileInfo;
+
+  struct WriteContext {
+    SuperVersionContext superversion_context;
+    autovector<MemTable*> memtables_to_free_;
+
+    explicit WriteContext(bool create_superversion = false)
+        : superversion_context(create_superversion) {}
+
+    ~WriteContext() {
+      superversion_context.Clean();
+      for (auto& m : memtables_to_free_) {
+        delete m;
+      }
+    }
+  };
+
+  struct LogFileNumberSize {
+    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+    LogFileNumberSize() {}
+    void AddSize(uint64_t new_size) { size += new_size; }
+    uint64_t number;
+    uint64_t size = 0;
+    bool getting_flushed = false;
+  };
+
+  struct LogWriterNumber {
+    // Pass ownership of _writer
+    LogWriterNumber(uint64_t _number, log::Writer* _writer)
+        : number(_number), writer(_writer) {}
+
+    log::Writer* ReleaseWriter() {
+      auto* w = writer;
+      writer = nullptr;
+      return w;
+    }
+    Status ClearWriter() {
+      Status s = writer->WriteBuffer();
+      delete writer;
+      writer = nullptr;
+      return s;
+    }
+
+    bool IsSyncing() { return getting_synced; }
+
+    uint64_t GetPreSyncSize() {
+      assert(getting_synced);
+      return pre_sync_size;
+    }
+
+    void PrepareForSync() {
+      assert(!getting_synced);
+      // Size is expected to be monotonically increasing.
+      assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+      getting_synced = true;
+      pre_sync_size = writer->file()->GetFlushedSize();
+    }
+
+    void FinishSync() {
+      assert(getting_synced);
+      getting_synced = false;
+    }
+
+    uint64_t number;
+    // Visual Studio doesn't support a deque member that is noncopyable
+    // because of a std::unique_ptr member, so a raw owned pointer is used.
+    log::Writer* writer;  // own
+
+   private:
+    // true for some prefix of logs_
+    bool getting_synced = false;
+    // The size of the file before the sync happens. This amount is
+    // guaranteed to be persisted even if appends happen during sync, so it
+    // can be used for tracking the synced size in the MANIFEST.
+    uint64_t pre_sync_size = 0;
+  };
+
+  struct LogContext {
+    explicit LogContext(bool need_sync = false)
+        : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
+    bool need_log_sync = false;
+    bool need_log_dir_sync = false;
+    log::Writer* writer = nullptr;
+    LogFileNumberSize* log_file_number_size = nullptr;
+  };
+
+  // PurgeFileInfo is a structure to hold information of files to be deleted
+  // in purge_files_
+  struct PurgeFileInfo {
+    std::string fname;
+    std::string dir_to_sync;
+    FileType type;
+    uint64_t number;
+    int job_id;
+    PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+                  int jid)
+        : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+  };
+
+  // Argument required by background flush thread.
+  struct BGFlushArg {
+    BGFlushArg()
+        : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+    BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+               SuperVersionContext* superversion_context)
+        : cfd_(cfd),
+          max_memtable_id_(max_memtable_id),
+          superversion_context_(superversion_context) {}
+
+    // Column family to flush.
+    ColumnFamilyData* cfd_;
+    // Maximum ID of memtable to flush. In this column family, memtables with
+    // IDs smaller than this value must be flushed before this flush
+    // completes.
+    uint64_t max_memtable_id_;
+    // Pointer to a SuperVersionContext object. After flush completes,
+    // RocksDB installs a new superversion for the column family. This
+    // operation requires a SuperVersionContext object (currently embedded in
+    // JobContext).
+    SuperVersionContext* superversion_context_;
+  };
+
+  // Argument passed to flush thread.
+  struct FlushThreadArg {
+    DBImpl* db_;
+
+    Env::Priority thread_pri_;
+  };
+
+  // Information for a manual compaction
+  struct ManualCompactionState {
+    ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+                          int _output_level, uint32_t _output_path_id,
+                          bool _exclusive, bool _disallow_trivial_move,
+                          std::atomic<bool>* _canceled)
+        : cfd(_cfd),
+          input_level(_input_level),
+          output_level(_output_level),
+          output_path_id(_output_path_id),
+          exclusive(_exclusive),
+          disallow_trivial_move(_disallow_trivial_move),
+          canceled(_canceled ? *_canceled : canceled_internal_storage) {}
+    // When _canceled is not provided by the user, we assign the reference of
+    // canceled_internal_storage to it to consolidate canceled and
+    // manual_compaction_paused, since DisableManualCompaction() might be
+    // called.
+
+    ColumnFamilyData* cfd;
+    int input_level;
+    int output_level;
+    uint32_t output_path_id;
+    Status status;
+    bool done = false;
+    bool in_progress = false;  // compaction request being processed?
+    bool incomplete = false;     // only part of requested range compacted
+    bool exclusive;              // current behavior of only one manual
+    bool disallow_trivial_move;  // Force actual compaction to run
+    const InternalKey* begin = nullptr;  // nullptr means beginning of key range
+    const InternalKey* end = nullptr;    // nullptr means end of key range
+    InternalKey* manual_end = nullptr;   // how far we are compacting
+    InternalKey tmp_storage;   // Used to keep track of compaction progress
+    InternalKey tmp_storage1;  // Used to keep track of compaction progress
+
+    // When the user provides a canceled pointer in CompactRangeOptions, the
+    // above variable is the reference of the user-provided `canceled`;
+    // otherwise, it is the reference of canceled_internal_storage
+    std::atomic<bool> canceled_internal_storage = false;
+    std::atomic<bool>& canceled;  // Compaction canceled pointer reference
+  };
+  struct PrepickedCompaction {
+    // background compaction takes ownership of `compaction`.
+    Compaction* compaction;
+    // caller retains ownership of `manual_compaction_state` as it is reused
+    // across background compactions.
+    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
+    // task limiter token is requested during compaction picking.
+    std::unique_ptr<TaskLimiterToken> task_token;
+  };
+
+  struct CompactionArg {
+    // caller retains ownership of `db`.
+    DBImpl* db;
+    // background compaction takes ownership of `prepicked_compaction`.
+    PrepickedCompaction* prepicked_compaction;
+    Env::Priority compaction_pri_;
+  };
+
+  // Initialize the built-in column family for persistent stats. Depending
+  // on whether on-disk persistent stats have been enabled before, it may
+  // either create a new column family and column family handle or just a
+  // column family handle.
+  // Required: DB mutex held
+  Status InitPersistStatsColumnFamily();
+
+  // The Persistent Stats column family has two format version keys which
+  // are used for compatibility checks. Write the format version if it's
+  // created for the first time; read the format version and check
+  // compatibility if recovering from disk. This function requires the DB
+  // mutex to be held at entrance but may release and re-acquire the DB
+  // mutex in the process.
+  // Required: DB mutex held
+  Status PersistentStatsProcessFormatVersion();
+
+  Status ResumeImpl(DBRecoverContext context);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                const std::string& cf_name,
+                                ColumnFamilyHandle** handle);
+
+  Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+  // Delete obsolete files and log the status and information of the file
+  // deletion
+  void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+                              const std::string& path_to_sync, FileType type,
+                              uint64_t number);
+
+  // Background process needs to call
+  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
+  //     auto file_num = versions_->NewFileNumber();
+  //     <do something>
+  //     ReleaseFileNumberFromPendingOutputs(x)
+  // This will protect any file with number `file_num` or greater from being
+  // deleted while <do something> is running.
+  // -----------
+  // This function will capture the current file number and append it to
+  // pending_outputs_. This will prevent any background process from deleting
+  // any file created after this point.
+  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+  // This function should be called with the result of
+  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not
+  // live and blocked by any other pending_outputs_ calls)
+  void ReleaseFileNumberFromPendingOutputs(
+      std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+  IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
+
+  // Flush the in-memory write buffer to storage. Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful. Then
+  // installs a new super version for the column family.
+  Status FlushMemTableToOutputFile(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      bool* madeProgress, JobContext* job_context,
+      SuperVersionContext* superversion_context,
+      std::vector<SequenceNumber>& snapshot_seqs,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+      Env::Priority thread_pri);
+
+  // Flush the memtables of (multiple) column families to multiple files on
+  // persistent storage.
+  Status FlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer,
+      Env::Priority thread_pri);
+
+  Status AtomicFlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer,
+      Env::Priority thread_pri);
+
+  // REQUIRES: log_numbers are sorted in ascending order
+  // corrupted_log_found is set to true if we recover from a corrupted log
+  // file.
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* next_sequence, bool read_only,
+                         bool* corrupted_log_found,
+                         RecoveryContext* recovery_ctx);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used at database recovery time (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // concurrent flushing of memtables to storage.
+  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                     MemTable* mem, VersionEdit* edit);
+
+  // Get the size of a log file and, if truncate is true, truncate the
+  // log file to its actual size, thereby freeing preallocated space.
+  // Return success even if truncate fails
+  Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+                                    LogFileNumberSize* log);
+
+  // Restore alive_log_files_ and total_log_size_ after recovery.
+  // It needs to run only when there's no flush during recovery
+  // (e.g. avoid_flush_during_recovery=true). May also trigger a flush
+  // in case total_log_size > max_total_wal_size.
+  Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+  // num_bytes: for the slowdown case, delay time is calculated based on
+  // `num_bytes` going through.
+  Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+  // Begin stalling of writes when memory usage increases beyond a certain
+  // threshold.
+  void WriteBufferManagerStallWrites();
+
+  Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                      WriteBatch* my_batch);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status ScheduleFlushes(WriteContext* context);
+
+  void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+  Status TrimMemtableHistory(WriteContext* context);
+
+  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+  void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+                       FlushReason flush_reason,
+                       bool entered_write_thread = false);
+
+  Status AtomicFlushMemTables(
+      const autovector<ColumnFamilyData*>& column_family_datas,
+      const FlushOptions& options, FlushReason flush_reason,
+      bool entered_write_thread = false);
+
+  // Wait until flushing this column family won't stall writes
+  Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                           bool* flush_needed);
+
+  // Wait for a memtable to be flushed.
+  // If flush_memtable_id is non-null, wait until the memtable with that ID
+  // gets flushed. Otherwise, wait until the column family doesn't have any
+  // memtable pending flush.
+  // resuming_from_bg_err indicates whether the caller is attempting to
+  // resume from background error.
+  Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+                              const uint64_t* flush_memtable_id = nullptr,
+                              bool resuming_from_bg_err = false) {
+    return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+                                 resuming_from_bg_err);
+  }
+  // Wait for memtables to be flushed for multiple column families.
+  Status WaitForFlushMemTables(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const uint64_t*>& flush_memtable_ids,
+      bool resuming_from_bg_err);
+
+  inline void WaitForPendingWrites() {
+    mutex_.AssertHeld();
+    TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+    // In case pipelined write is enabled, wait for all pending memtable
+    // writers.
+    if (immutable_db_options_.enable_pipelined_write) {
+      // Memtable writers may call DB::Get in case max_successive_merges > 0,
+      // which may lock the mutex. Unlock the mutex here to avoid deadlock.
+      mutex_.Unlock();
+      write_thread_.WaitForMemTableWriters();
+      mutex_.Lock();
+    }
+
+    if (!immutable_db_options_.unordered_write) {
+      // Then the writes are finished before the next write group starts
+      return;
+    }
+
+    // Wait for the ones who already wrote to the WAL to finish their
+    // memtable write.
+    if (pending_memtable_writes_.load() != 0) {
+      std::unique_lock<std::mutex> guard(switch_mutex_);
+      switch_cv_.wait(guard,
+                      [&] { return pending_memtable_writes_.load() == 0; });
+    }
+  }
+
+  // TaskType is used to identify tasks in the thread pool; currently it only
+  // differentiates manual compaction, which can be unscheduled from the
+  // thread pool.
+  enum class TaskType : uint8_t {
+    kDefault = 0,
+    kManualCompaction = 1,
+    kCount = 2,
+  };
+
+  // A task tag is used to identify tasks in the thread pool; it is the
+  // dbImpl object address + type. (See the UnSchedule sketch at the end of
+  // this block.)
+  inline void* GetTaskTag(TaskType type) {
+    return GetTaskTag(static_cast<uint8_t>(type));
+  }
+
+  inline void* GetTaskTag(uint8_t type) {
+    return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+  }
+
+  // REQUIRES: mutex locked and in write thread.
+  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status SwitchWAL(WriteContext* write_context);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status HandleWriteBufferManagerFlush(WriteContext* write_context);
+
+  // REQUIRES: mutex locked
+  Status PreprocessWrite(const WriteOptions& write_options,
+                         LogContext* log_context,
+                         WriteContext* write_context);
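+  // Illustrative use of the task tags above (a sketch; it assumes the
+  // Env::UnSchedule(tag, pri) interface): a queued-but-unstarted manual
+  // compaction in the LOW pool can be pulled back out by its tag without
+  // disturbing other scheduled work:
+  //
+  //   env_->UnSchedule(GetTaskTag(TaskType::kManualCompaction),
+  //                    Env::Priority::LOW);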
+  // Merge the write batches in the write group into merged_batch.
+  // Returns OK if the merge is successful.
+  // Returns Corruption if corruption in the write batch is detected.
+  Status MergeBatch(const WriteThread::WriteGroup& write_group,
+                    WriteBatch* tmp_batch, WriteBatch** merged_batch,
+                    size_t* write_with_wal, WriteBatch** to_be_cached_state);
+
+  // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
+  // for automatic WAL flush (`Options::manual_wal_flush` == false)
+  // associated with this WriteToWAL
+  IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+                      uint64_t* log_used, uint64_t* log_size,
+                      Env::IOPriority rate_limiter_priority,
+                      LogFileNumberSize& log_file_number_size);
+
+  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
+                      log::Writer* log_writer, uint64_t* log_used,
+                      bool need_log_sync, bool need_log_dir_sync,
+                      SequenceNumber sequence,
+                      LogFileNumberSize& log_file_number_size);
+
+  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+                                uint64_t* log_used,
+                                SequenceNumber* last_sequence,
+                                size_t seq_inc);
+
+  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+  // Caller must hold mutex_.
+  void WriteStatusCheckOnLocked(const Status& status);
+
+  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+  void WriteStatusCheck(const Status& status);
+
+  // Used by WriteImpl to update bg_error_ when an IO error happens, e.g.,
+  // writing the WAL or syncing the WAL fails, if paranoid check is enabled.
+  void IOStatusCheck(const IOStatus& status);
+
+  // Used by WriteImpl to update bg_error_ in case of a memtable insert
+  // error.
+  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+  Status CompactFilesImpl(const CompactionOptions& compact_options,
+                          ColumnFamilyData* cfd, Version* version,
+                          const std::vector<std::string>& input_file_names,
+                          std::vector<std::string>* const output_file_names,
+                          const int output_level, int output_path_id,
+                          JobContext* job_context, LogBuffer* log_buffer,
+                          CompactionJobInfo* compaction_job_info);
+
+  // Wait for current IngestExternalFile() calls to finish.
+  // REQUIRES: mutex_ held
+  void WaitForIngestFile();
+#else
+  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+  // will be a no-op
+  void WaitForIngestFile() {}
+#endif  // ROCKSDB_LITE
+
+  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+  void MaybeScheduleFlushOrCompaction();
+
+  // A flush request specifies the column families to flush as well as the
+  // largest memtable id to persist for each column family. Once all the
+  // memtables whose IDs are smaller than or equal to this per-column-family
+  // value have been flushed, this flush request is considered to have
+  // completed its work of flushing this column family. After completing the
+  // work for all column families in this request, this flush is considered
+  // complete.
+  using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;
+
+  void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                            FlushRequest* req);
+
+  void SchedulePendingFlush(const FlushRequest& req,
+                            FlushReason flush_reason);
+
+  void SchedulePendingCompaction(ColumnFamilyData* cfd);
+  void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                            FileType type, uint64_t number, int job_id);
+  static void BGWorkCompaction(void* arg);
+  // Runs a pre-chosen universal compaction involving the bottom level in a
+  // separate, bottom-pri thread pool.
+  static void BGWorkBottomCompaction(void* arg);
+  static void BGWorkFlush(void* arg);
+  static void BGWorkPurge(void* arg);
+  static void UnscheduleCompactionCallback(void* arg);
+  static void UnscheduleFlushCallback(void* arg);
+  void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                Env::Priority thread_pri);
+  void BackgroundCallFlush(Env::Priority thread_pri);
+  void BackgroundCallPurge();
+  Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+                              LogBuffer* log_buffer,
+                              PrepickedCompaction* prepicked_compaction,
+                              Env::Priority thread_pri);
+  Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+                         LogBuffer* log_buffer, FlushReason* reason,
+                         Env::Priority thread_pri);
+
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+  // Request a compaction task token from the compaction thread limiter.
+  // It always succeeds if force = true or the limiter is disabled.
+  bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                              std::unique_ptr<TaskLimiterToken>* token,
+                              LogBuffer* log_buffer);
+
+  // Schedule background tasks
+  Status StartPeriodicTaskScheduler();
+
+  Status RegisterRecordSeqnoTimeWorker();
+
+  void PrintStatistics();
+
+  size_t EstimateInMemoryStatsHistorySize() const;
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level if such a level could not be found.
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that
+  // could hold the data set.
+  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+  // Helper functions for adding and removing from flush & compaction queues
+  void AddToCompactionQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromCompactionQueue();
+  FlushRequest PopFirstFromFlushQueue();
+
+  // Pick the first unthrottled compaction with a task token from the queue.
+  ColumnFamilyData* PickCompactionFromQueue(
+      std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+  // Helper function to call after some of the logs_ were synced
+  void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
+  Status ApplyWALToManifest(VersionEdit* edit);
+  // WALs with log numbers up to up_to are not synced successfully.
+  void MarkLogsNotSynced(uint64_t up_to);
+
+  SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+                                bool lock = true);
+
+  // If snapshot_seq != kMaxSequenceNumber, then this function can only be
+  // called from the write thread that publishes sequence numbers to readers.
+  // For 1) write-committed, or 2) write-prepared + one-write-queue, this
+  // will be the write thread performing memtable writes. For write-prepared
+  // with two write queues, this will be the write thread writing the commit
+  // marker to the WAL.
+  // If snapshot_seq == kMaxSequenceNumber, this function is called by a
+  // caller ensuring no writes to the database.
+  std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+  CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+                                bool lock = true);
+
+  uint64_t GetMaxTotalWalSize() const;
+
+  FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+  Status MaybeReleaseTimestampedSnapshotsAndCheck();
+
+  Status CloseHelper();
+
+  void WaitForBackgroundWork();
+
+  // Background threads call this function, which is just a wrapper around
+  // the InstallSuperVersion() function. Background threads carry
+  // sv_context which can have new_superversion already
+  // allocated.
+  // All ColumnFamily state changes go through this function. Here we analyze
+  // the new state and we schedule background work if we detect that the new
+  // state needs flush or compaction.
+  void InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+      const MutableCFOptions& mutable_cf_options);
+
+  bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+                              const DBPropertyInfo& property_info,
+                              bool is_locked, uint64_t* value);
+  bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+  bool HasPendingManualCompaction();
+  bool HasExclusiveManualCompaction();
+  void AddManualCompaction(ManualCompactionState* m);
+  void RemoveManualCompaction(ManualCompactionState* m);
+  bool ShouldntRunManualCompaction(ManualCompactionState* m);
+  bool HaveManualCompaction(ColumnFamilyData* cfd);
+  bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+  void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+                              const Status& st,
+                              const CompactionJobStats& compaction_job_stats,
+                              const int job_id, const Version* current,
+                              CompactionJobInfo* compaction_job_info) const;
+  // Reserve the next 'num' file numbers for to-be-ingested external SST
+  // files, and return the current file_number in 'next_file_number'.
+  // Write a version edit to the MANIFEST.
+  Status ReserveFileNumbersBeforeIngestion(
+      ColumnFamilyData* cfd, uint64_t num,
+      std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+      uint64_t* next_file_number);
+#endif  // !ROCKSDB_LITE
+
+  bool ShouldPurge(uint64_t file_number) const;
+  void MarkAsGrabbedForPurge(uint64_t file_number);
+
+  size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+  IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+                     size_t preallocate_block_size, log::Writer** new_log);
+
+  // Validate self-consistency of DB options
+  static Status ValidateOptions(const DBOptions& db_options);
+  // Validate self-consistency of DB options and their consistency with cf
+  // options
+  static Status ValidateOptions(
+      const DBOptions& db_options,
+      const std::vector<ColumnFamilyDescriptor>& column_families);
+
+  // Utility function to do some debug validation and sort the given vector
+  // of MultiGet keys
+  void PrepareMultiGetKeys(
+      const size_t num_keys, bool sorted,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+  // A structure to hold the information required to process MultiGet of keys
+  // belonging to one column family. For a multi column family MultiGet,
+  // there will be a container of these objects.
+  struct MultiGetColumnFamilyData {
+    ColumnFamilyHandle* cf;
+    ColumnFamilyData* cfd;
+
+    // For the batched MultiGet which relies on sorted keys, start specifies
+    // the index of the first key belonging to this column family in the
+    // sorted list.
+    size_t start;
+
+    // For the batched MultiGet case, num_keys specifies the number of keys
+    // belonging to this column family in the sorted list
+    size_t num_keys;
+
+    // SuperVersion for the column family obtained in a manner that ensures a
+    // consistent view across all column families in the DB
+    SuperVersion* super_version;
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+                             SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(0),
+          num_keys(0),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+                             size_t count, SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(first),
+          num_keys(count),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData() = default;
+  };
+
+  // A common function to obtain a consistent snapshot, which can be implicit
+  // if the user doesn't specify a snapshot in read_options, across
+  // multiple column families for MultiGet. It will attempt to get an
+  // implicit snapshot without acquiring the db mutex, but will give up after
+  // a few tries and acquire the mutex if a memtable flush happens. The
+  // template allows both the batched and non-batched MultiGet to call this
+  // with either an std::unordered_map or autovector of column families.
+  //
+  // If callback is non-null, the callback is refreshed with the snapshot
+  // sequence number
+  //
+  // A return value of true indicates that the SuperVersions were obtained
+  // from the ColumnFamilyData, whereas false indicates they are thread
+  // local
+  template <class T>
+  bool MultiCFSnapshot(
+      const ReadOptions& read_options, ReadCallback* callback,
+      std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+          iter_deref_func,
+      T* cf_list, SequenceNumber* snapshot);
+
+  // The actual implementation of the batching MultiGet. The caller is
+  // expected to have acquired the SuperVersion and pass in a snapshot
+  // sequence number in order to construct the LookupKeys. The start_key and
+  // num_keys specify the range of keys in the sorted_keys vector for a
+  // single column family.
+  Status MultiGetImpl(
+      const ReadOptions& read_options, size_t start_key, size_t num_keys,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+      SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
+
+  Status DisableFileDeletionsWithLock();
+
+  Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+                                      std::string ts_low);
+
+  bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
+
+  // Lock over the persistent DB state. Non-nullptr iff successfully
+  // acquired.
+  FileLock* db_lock_;
+
+  // In addition to mutex_, stats_history_mutex_ protects writes to
+  // stats_history_
+  InstrumentedMutex stats_history_mutex_;
+  // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+  // logfile_number_. With two_write_queues it also protects
+  // alive_log_files_ and log_empty_. Refer to the definition of each
+  // variable below for more details.
+  // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+  // acquired, the order should be first mutex_ and then log_write_mutex_.
+  InstrumentedMutex log_write_mutex_;
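+  // Lock-order sketch for the note above (illustrative): whenever both
+  // mutexes are needed, acquire mutex_ first, then log_write_mutex_:
+  //
+  //   InstrumentedMutexLock l1(&mutex_);
+  //   InstrumentedMutexLock l2(&log_write_mutex_);
+  //   // ... safe to mutate logs_ / logfile_number_ here ...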
+  // If zero, manual compactions are allowed to proceed. If non-zero, manual
+  // compactions may still be running, but will quickly fail with
+  // `Status::Incomplete`. The value indicates how many threads have paused
+  // manual compactions. It is accessed in read mode outside the DB mutex in
+  // compaction code paths.
+  std::atomic<int> manual_compaction_paused_;
+
+  // This condition variable is signaled on these conditions:
+  // * whenever bg_compaction_scheduled_ goes down to 0
+  // * if AnyManualCompaction, whenever a compaction finishes, even if it
+  //   hasn't made any progress
+  // * whenever a compaction made any progress
+  // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+  //   (i.e. whenever a flush is done, even if it didn't make any progress)
+  // * whenever there is an error in background purge, flush or compaction
+  // * whenever num_running_ingest_file_ goes to 0.
+  // * whenever pending_purge_obsolete_files_ goes to 0.
+  // * whenever disable_delete_obsolete_files_ goes to 0.
+  // * whenever SetOptions successfully updates options.
+  // * whenever a column family is dropped.
+  InstrumentedCondVar bg_cv_;
+  // Writes are protected by locking both mutex_ and log_write_mutex_, and
+  // reads must be under either mutex_ or log_write_mutex_. Since after ::Open,
+  // logfile_number_ is currently updated only in write_thread_, it can be read
+  // from the same write_thread_ without any locks.
+  uint64_t logfile_number_;
+  // Log files that we can recycle. Must be protected by db mutex_.
+  std::deque<uint64_t> log_recycle_files_;
+  // Protected by log_write_mutex_.
+  bool log_dir_synced_;
+  // Without two_write_queues, reads and writes to log_empty_ are protected by
+  // mutex_. Since it is currently updated/read only in write_thread_, it can
+  // be accessed from the same write_thread_ without any locks. With
+  // two_write_queues writes, where it can be updated in different threads,
+  // reads and writes are protected by log_write_mutex_ instead. This is to
+  // avoid expensive mutex_ locking during WAL writes, which update log_empty_.
+  bool log_empty_;
+
+  ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+  bool persistent_stats_cfd_exists_ = true;
+
+  // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
+  // as follows:
+  // 1. read by FindObsoleteFiles() which can be called in either application
+  //    thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+  //    held.
+  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+  //    are held.
+  // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
+  //    (actually called by Open()), only mutex_ is held because at this point,
+  //    the DB::Open() call has not returned success to application, and the
+  //    only other thread(s) that can conflict are bg threads calling
+  //    FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
+  //    are held when accessing alive_log_files_.
+  // 4. read by DBImpl::Open() is protected by mutex_.
+  // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+  //    held. This is done by the write group leader. Note that in the case of
+  //    two-write-queues, another WAL-only write thread can be writing to the
+  //    WAL concurrently. See 9.
+  // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+  //    done by write group leader.
+  // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
+  //    two-write-queues. Only log_write_mutex_ is held to protect concurrent
+  //    pop_front() by FindObsoleteFiles().
+  // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+  //    is held to protect the data structure from concurrent pop_front() by
+  //    FindObsoleteFiles().
+  // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
+  //    of two-write-queues. Only log_write_mutex_ is held. This suffices to
+  //    protect the data structure from concurrent push_back() by the current
+  //    write group leader as well as pop_front() by FindObsoleteFiles().
+  std::deque<LogFileNumberSize> alive_log_files_;
+
+  // Log files that aren't fully synced, and the current log file.
+  // Synchronization:
+  // 1. read by FindObsoleteFiles() which can be called either in application
+  //    thread or RocksDB bg threads. log_write_mutex_ is always held, while
+  //    some reads are performed without mutex_.
+  // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
+  // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
+  // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex_.
+  //    Note that at this point, DB::Open() has not returned success to
+  //    application, thus the only other thread(s) that can conflict are bg
+  //    threads calling FindObsoleteFiles(). See 1.
+  // 5. iteration and clear() from CloseHelper() always hold log_write_mutex_
+  //    and mutex_.
+  // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
+  //    log_write_mutex_. These two can be called by application threads after
+  //    DB::Open() returns success to applications.
+  // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+  // 8. reads by MarkLogsNotSynced() and MarkLogsSynced() are protected by
+  //    log_write_mutex_.
+  // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
+  // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
+  //     happen in bg flush threads after DB::Open() returns success to
+  //     applications.
+  // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
+  //     hold only the log_write_mutex_. This is done by the write group
+  //     leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
+  //     can happen concurrently. This is fine because log_write_mutex_ is used
+  //     by all parties. See 2, 5, 9.
+  // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
+  //     log_write_mutex_. This happens in the write group leader.
+  // 13. emplace_back() by SwitchMemtable() holds both mutex_ and
+  //     log_write_mutex_. This happens in the write group leader. Can conflict
+  //     with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
+  //     SyncClosedLogs(), etc. as well as application threads calling
+  //     FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
+  //     require at least log_write_mutex_.
+  // 14. iteration called in WriteToWAL(write_group) protected by
+  //     log_write_mutex_. This is done by the write group leader when
+  //     two-write-queues is disabled and the write needs to sync logs.
+  // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+  //     This can be done by the write group leader if two-write-queues is
+  //     enabled. It can also be done by another WAL-only write thread.
+  //
+  // Other observations:
+  // - back() and items with getting_synced=true are not popped,
+  // - The same thread that sets getting_synced=true will reset it.
+  // - it follows that the object referred to by back() can be safely read from
+  //   the write_thread_ without using mutex. Note that calling back() without
+  //   mutex may be unsafe because different implementations of deque::back()
+  //   may access other member variables of deque, causing undefined behavior.
+  //   Generally, do not access stl containers without proper synchronization.
+  // - it follows that the items with getting_synced=true can be safely read
+  //   from the same thread that has set getting_synced=true
+  std::deque<LogWriterNumber> logs_;
+
+  // Signaled when getting_synced becomes false for some of the logs_.
+  InstrumentedCondVar log_sync_cv_;
+  // This is the app-level state that is written to the WAL but will be used
+  // only during recovery. Using this feature enables not writing the state to
+  // memtable on normal writes and hence improving the throughput. Each new
+  // write of the state will replace the previous state entirely even if the
+  // keys in the two consecutive states do not overlap.
+  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // Otherwise only the head of write_thread_ can access it.
+  WriteBatch cached_recoverable_state_;
+  std::atomic<bool> cached_recoverable_state_empty_ = {true};
+  std::atomic<uint64_t> total_log_size_;
+
+  // If this is non-empty, we need to delete these log files in background
+  // threads. Protected by log_write_mutex_.
+  autovector<log::Writer*> logs_to_free_;
+
+  bool is_snapshot_supported_;
+
+  std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+  std::map<std::string, uint64_t> stats_slice_;
+
+  bool stats_slice_initialized_ = false;
+
+  Directories directories_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  WriteThread write_thread_;
+  WriteBatch tmp_batch_;
+  // The write thread when the writers have no memtable write. This will be
+  // used in 2PC to batch the prepares separately from the serial commit.
+  WriteThread nonmem_write_thread_;
+
+  WriteController write_controller_;
+
+  // Size of the last batch group. In slowdown mode, next write needs to
+  // sleep if it uses up the quota.
+  // Note: This is to protect memtable and compaction. If the batch only writes
+  // to the WAL its size need not be included in this.
+  uint64_t last_batch_group_size_;
+
+  FlushScheduler flush_scheduler_;
+
+  TrimHistoryScheduler trim_history_scheduler_;
+
+  SnapshotList snapshots_;
+
+  TimestampedSnapshotList timestamped_snapshots_;
+
+  // For each background job, pending_outputs_ keeps the current file number at
+  // the time that background job started.
+  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+  // a number bigger than any of the file numbers in pending_outputs_. Since
+  // file numbers grow monotonically, this also means that pending_outputs_ is
+  // always sorted. After a background job is done executing, its file number
+  // is deleted from pending_outputs_, which allows PurgeObsoleteFiles() to
+  // clean it up.
+  // State is protected with db mutex.
+  std::list<uint64_t> pending_outputs_;
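+  // Illustrative sketch (not part of this header) of the pending_outputs_
+  // protocol as used by background jobs; the helper names below are the
+  // DBImpl wrappers around this list, and the exact signatures may differ:
+  //
+  //   // with mutex_ held, before the job starts:
+  //   auto pending_elem = CaptureCurrentFileNumberInPendingOutputs();
+  //   ...  // job creates files with numbers >= the captured number
+  //   // with mutex_ held, after the job finishes:
+  //   ReleaseFileNumberFromPendingOutputs(pending_elem);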
+  // flush_queue_ and compaction_queue_ hold column families that we need to
+  // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies the
+  // condition cfd->imm()->IsFlushPending().
+  // A column family is inserted into compaction_queue_ when it satisfies the
+  // condition cfd->NeedsCompaction().
+  // Column families in this list are all Ref()-erenced
+  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+  // do RAII on ColumnFamilyData
+  // Column families are in this queue when they need to be flushed or
+  // compacted. Consumers of these queues are flush and compaction threads.
+  // When a column family is put on this queue, we increase
+  // unscheduled_flushes_ and unscheduled_compactions_. When these variables
+  // are bigger than zero, that means we need to schedule background threads
+  // for flush and compaction.
+  // Once the background threads are scheduled, we decrease
+  // unscheduled_flushes_ and unscheduled_compactions_. That way we keep track
+  // of the number of compaction and flush threads we need to schedule. This
+  // scheduling is done in MaybeScheduleFlushOrCompaction().
+  // invariant(column family present in flush_queue_ <==>
+  // ColumnFamilyData::pending_flush_ == true)
+  std::deque<ColumnFamilyData*> flush_queue_;
+  // invariant(column family present in compaction_queue_ <==>
+  // ColumnFamilyData::pending_compaction_ == true)
+  std::deque<ColumnFamilyData*> compaction_queue_;
+
+  // A map to store file numbers and filenames of the files to be purged
+  std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+  // A vector to store the file numbers that have been assigned to certain
+  // JobContext. Current implementation tracks table and blob files only.
+  std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+  // A queue to store log writers to close. Protected by db mutex_.
+  std::deque<log::Writer*> logs_to_free_queue_;
+
+  std::deque<SuperVersion*> superversions_to_free_queue_;
+
+  int unscheduled_flushes_;
+
+  int unscheduled_compactions_;
+
+  // count how many background compactions are running or have been scheduled
+  // in the BOTTOM pool
+  int bg_bottom_compaction_scheduled_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // stores the number of compactions that are currently running
+  int num_running_compactions_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // stores the number of flushes that are currently running
+  int num_running_flushes_;
+
+  // number of background obsolete file purge jobs, submitted to the HIGH pool
+  int bg_purge_scheduled_;
+
+  std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+  // shall we disable deletion of obsolete files
+  // if 0 the deletion is enabled.
+  // if non-zero, files will not be deleted.
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // Number of times FindObsoleteFiles has found deletable files and the
+  // corresponding call to PurgeObsoleteFiles has not yet finished.
+  int pending_purge_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles with full scan was executed. Originally
+  // initialized with startup time.
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // The thread that wants to switch memtable can wait on this cv until the
+  // pending writes to memtable finish.
+  std::condition_variable switch_cv_;
+  // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+  std::mutex switch_mutex_;
+  // Number of threads intending to write to memtable
+  std::atomic<size_t> pending_memtable_writes_ = {};
+
+  // A flag indicating whether the current rocksdb database has any
+  // data that is not yet persisted into either WAL or SST file.
+  // Used when disableWAL is true.
+  std::atomic<bool> has_unpersisted_data_;
+
+  // if an attempt was made to flush all column families that
+  // the oldest log depends on but uncommitted data in the oldest
+  // log prevents the log from being released.
+  // We must attempt to free the dependent memtables again
+  // at a later time after the transaction in the oldest
+  // log is fully committed.
+  bool unable_to_release_oldest_log_;
+
+  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+  // calls.
+  // REQUIRES: mutex held
+  int num_running_ingest_file_;
+#ifndef ROCKSDB_LITE
+  WalManager wal_manager_;
+#endif  // ROCKSDB_LITE
+
+  // A value of > 0 temporarily disables scheduling of background work
+  int bg_work_paused_;
+
+  // A value of > 0 temporarily disables scheduling of background compaction
+  int bg_compaction_paused_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // Indicate DB was opened successfully
+  bool opened_successfully_;
+
+  // The min threshold to trigger bottommost compaction for removing
+  // garbage, among all column families.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  LogsWithPrepTracker logs_with_prep_tracker_;
+
+  // Callback for compaction to check if a key is visible to a snapshot.
+  // REQUIRES: mutex held
+  std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+  // Callback for when the cached_recoverable_state_ is written to memtable
+  // Only to be set during initialization
+  std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+#ifndef ROCKSDB_LITE
+  // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
+  // Currently, internally it has a global timer instance for running the
+  // tasks.
+  PeriodicTaskScheduler periodic_task_scheduler_;
+
+  // It contains the implementations for each periodic task.
+  std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
+#endif
+
+  // When set, we use a separate queue for writes that don't write to memtable.
+  // In 2PC these are the writes at Prepare phase.
+  const bool two_write_queues_;
+  const bool manual_wal_flush_;
+
+  // LastSequence also indicates last published sequence visible to the
+  // readers. Otherwise LastPublishedSequence should be used.
+  const bool last_seq_same_as_publish_seq_;
+  // It indicates that a customized gc algorithm must be used for
+  // flush/compaction and if it is not provided via SnapshotChecker, we should
+  // disable gc to be safe.
+  const bool use_custom_gc_;
+  // Flag to indicate that the DB instance shutdown has been initiated. This
+  // is different from the shutting_down_ atomic in that it is set at the
+  // beginning of the shutdown sequence, specifically in order to prevent any
+  // background error recovery from going on in parallel. The latter,
+  // shutting_down_, is set a little later during the shutdown after scheduling
+  // memtable flushes.
+  std::atomic<bool> shutdown_initiated_;
+  // Flag to indicate whether sst_file_manager object was allocated in
+  // DB::Open() or passed to us
+  bool own_sfm_;
+
+  // Flag to check whether Close() has been called on this DB
+  bool closed_;
+  // save the closing status, for re-calling the close()
+  Status closing_status_;
+  // mutex for DB::Close()
+  InstrumentedMutex closing_mutex_;
+
+  // Conditional variable to coordinate installation of atomic flush results.
+  // With atomic flush, each bg thread installs the result of flushing multiple
+  // column families, and different threads can flush different column
+  // families. It's difficult to rely on one thread to perform batch
+  // installation for all threads. This is different from the non-atomic flush
+  // case.
+  // atomic_flush_install_cv_ makes sure that threads install atomic flush
+  // results sequentially. Flush results of memtables with lower IDs get
+  // installed to MANIFEST first.
+  InstrumentedCondVar atomic_flush_install_cv_;
+
+  bool wal_in_db_path_;
+  std::atomic<uint64_t> max_total_wal_size_;
+
+  BlobFileCompletionCallback blob_callback_;
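+  // Illustrative sketch (not part of this header): the periodic tasks above
+  // are driven by user-visible options; a typical configuration, assuming
+  // default values elsewhere:
+  //
+  //   Options options;
+  //   options.stats_dump_period_sec = 600;     // DumpStats() cadence
+  //   options.stats_persist_period_sec = 600;  // PersistStats() cadence
+  //   options.stats_history_buffer_size = 1024 * 1024;  // in-memory history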
+  // Pointer to WriteBufferManager stalling interface.
+  std::unique_ptr<StallInterface> wbm_stall_;
+
+  // seqno_time_mapping_ stores the sequence number to time mapping; it is not
+  // thread safe: both reads and writes require the db mutex to be held.
+  SeqnoToTimeMapping seqno_time_mapping_;
+};
+
+class GetWithTimestampReadCallback : public ReadCallback {
+ public:
+  explicit GetWithTimestampReadCallback(SequenceNumber seq)
+      : ReadCallback(seq) {}
+  bool IsVisibleFullCheck(SequenceNumber seq) override {
+    return seq <= max_visible_seq_;
+  }
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src,
+                               bool read_only = false,
+                               Status* logger_creation_s = nullptr);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
+                                 bool read_only = false,
+                                 Status* logger_creation_s = nullptr);
+
+extern CompressionType GetCompressionFlush(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+
+// In non-2PC mode, WALs with log number < the returned number can be
+// deleted after the cfd_to_flush column family is flushed successfully.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
+// For atomic flush.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
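+// Illustrative sketch (not part of this header): how SanitizeOptions-style
+// code uses ClipToRange to pull an out-of-range option back into bounds; the
+// numeric limits here are example values, not the library's defaults:
+//
+//   Options result = src;
+//   ClipToRange(&result.max_open_files, 20, 1000000);
+//   ClipToRange(&result.write_buffer_size, static_cast<size_t>(64) << 10,
+//               static_cast<size_t>(64) << 30);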
+
+inline Status DBImpl::FailIfCfHasTs(
+    const ColumnFamilyHandle* column_family) const {
+  column_family = column_family ? column_family : DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    std::ostringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that enables timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  return Status::OK();
+}
+
+inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+                                         const Slice& ts,
+                                         bool ts_for_read) const {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be null");
+  }
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (0 == ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that does not enable timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  const size_t ts_sz = ts.size();
+  if (ts_sz != ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
+        << ts_sz << " given";
+    return Status::InvalidArgument(oss.str());
+  }
+  if (ts_for_read) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    auto cfd = cfh->cfd();
+    std::string current_ts_low = cfd->GetFullHistoryTsLow();
+    if (!current_ts_low.empty() &&
+        ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
+      std::stringstream oss;
+      oss << "Read timestamp: " << ts.ToString(true)
+          << " is smaller than full_history_ts_low: "
+          << Slice(current_ts_low).ToString(true) << std::endl;
+      return Status::InvalidArgument(oss.str());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..a605fac87
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3857 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <deque>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+  // Check if we have enough room to do the compaction
+  bool enough_room = true;
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm) {
+    // Pass the current bg_error_ to SFM so it can decide what checks to
+    // perform.
If this DB instance hasn't seen any error yet, the SFM can be + // optimistic and not do disk space checks + Status bg_error = error_handler_.GetBGError(); + enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error); + bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status + // from the error_handler_ + if (enough_room) { + *sfm_reserved_compact_space = true; + } + } +#else + (void)cfd; + (void)inputs; + (void)sfm_reserved_compact_space; +#endif // ROCKSDB_LITE + if (!enough_room) { + // Just in case tests want to change the value of enough_room + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room); + ROCKS_LOG_BUFFER(log_buffer, + "Cancelled compaction because not enough room"); + RecordTick(stats_, COMPACTION_CANCELLED, 1); + } + return enough_room; +} + +bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr* token, + LogBuffer* log_buffer) { + assert(*token == nullptr); + auto limiter = static_cast( + cfd->ioptions()->compaction_thread_limiter.get()); + if (limiter == nullptr) { + return true; + } + *token = limiter->GetToken(force); + if (*token != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "Thread limiter [%s] increase [%s] compaction task, " + "force: %s, tasks after: %d", + limiter->GetName().c_str(), cfd->GetName().c_str(), + force ? "true" : "false", limiter->GetOutstandingTask()); + return true; + } + return false; +} + +IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, + VersionEdit* synced_wals) { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); + InstrumentedMutexLock l(&log_write_mutex_); + autovector logs_to_sync; + uint64_t current_log_number = logfile_number_; + while (logs_.front().number < current_log_number && + logs_.front().IsSyncing()) { + log_sync_cv_.Wait(); + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number < current_log_number; ++it) { + auto& log = *it; + log.PrepareForSync(); + logs_to_sync.push_back(log.writer); + } + + IOStatus io_s; + if (!logs_to_sync.empty()) { + log_write_mutex_.Unlock(); + + assert(job_context); + + for (log::Writer* log : logs_to_sync) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, + log->get_log_number()); + if (error_handler_.IsRecoveryInProgress()) { + log->file()->reset_seen_error(); + } + io_s = log->file()->Sync(immutable_db_options_.use_fsync); + if (!io_s.ok()) { + break; + } + + if (immutable_db_options_.recycle_log_file_num > 0) { + if (error_handler_.IsRecoveryInProgress()) { + log->file()->reset_seen_error(); + } + io_s = log->Close(); + if (!io_s.ok()) { + break; + } + } + } + if (io_s.ok()) { + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + + TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", + /*arg=*/nullptr); + log_write_mutex_.Lock(); + + // "number <= current_log_number - 1" is equivalent to + // "number < current_log_number". 
+    if (io_s.ok()) {
+      MarkLogsSynced(current_log_number - 1, true, synced_wals);
+    } else {
+      MarkLogsNotSynced(current_log_number - 1);
+    }
+    if (!io_s.ok()) {
+      TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+      return io_s;
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
+  return io_s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    bool* made_progress, JobContext* job_context,
+    SuperVersionContext* superversion_context,
+    std::vector<SequenceNumber>& snapshot_seqs,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  assert(cfd->imm()->NumNotFlushed() != 0);
+  assert(cfd->imm()->IsFlushPending());
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+  // If there are more than one column families, we need to make sure that
+  // all the log files except the most recent one are synced. Otherwise if
+  // the host crashes after flushing and before WAL is persistent, the
+  // flushed SST may contain data from write batches whose updates to
+  // other (unflushed) column families are missing.
+  const bool needs_to_sync_closed_wals =
+      logfile_number_ > 0 &&
+      versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
+  // If needs_to_sync_closed_wals is true, we need to record the current
+  // maximum memtable ID of this column family so that a later PickMemtables()
+  // call will not pick memtables whose IDs are higher. This is due to the fact
+  // that SyncClosedLogs() may release the db mutex, and memtable switch can
+  // happen for this column family in the meantime. The newly created memtables
+  // have their data backed by unsynced WALs, thus they cannot be included in
+  // this flush job.
+  // Another reason why we must record the current maximum memtable ID of this
+  // column family: SyncClosedLogs() may release the db mutex, thus it's
+  // possible for the application to continue to insert into memtables,
+  // increasing the db's sequence number. The application may take a snapshot,
+  // but this snapshot is not included in `snapshot_seqs` which will be passed
+  // to the flush job because `snapshot_seqs` has already been computed before
+  // this function starts. Recording the max memtable ID ensures that the flush
+  // job does not flush a memtable without knowing such snapshot(s).
+  uint64_t max_memtable_id = needs_to_sync_closed_wals
+                                 ? cfd->imm()->GetLatestMemTableID()
+                                 : std::numeric_limits<uint64_t>::max();
+
+  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+  // existing memtables of the column family when PickMemTable() is called
+  // later. Although we won't call SyncClosedLogs() in this case, we may still
+  // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+  // releases and re-acquires the db mutex. In the meantime, the application
+  // can still insert into the memtables and increase the db's sequence number.
+  // The application can take a snapshot, hoping that the latest visible state
+  // to this snapshot is preserved. This is hard to guarantee since the db
+  // mutex is not held. This newly-created snapshot is not included in
+  // `snapshot_seqs` and the flush job is unaware of its presence.
+  // Consequently, the flush job may drop certain keys when generating the L0,
+  // causing incorrect data to be returned for snapshot reads using this
+  // snapshot.
+  // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+  // picking so that no new snapshot can be taken between the two functions.
+
+  FlushJob flush_job(
+      dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
+      file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
+      GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+      &event_logger_, mutable_cf_options.report_bg_io_stats,
+      true /* sync_output_directory */, true /* write_manifest */, thread_pri,
+      io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+      cfd->GetFullHistoryTsLow(), &blob_callback_);
+  FileMetaData file_meta;
+
+  Status s;
+  bool need_cancel = false;
+  IOStatus log_io_s = IOStatus::OK();
+  if (needs_to_sync_closed_wals) {
+    // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
+    // times.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+      TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+                               nullptr);
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+    }
+  } else {
+    TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+  }
+  s = log_io_s;
+
+  // If the log sync failed, we do not need to pick memtable. Otherwise,
+  // num_flush_not_started_ needs to be rolled back.
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+  if (s.ok()) {
+    flush_job.PickMemTable();
+    need_cancel = true;
+  }
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+#ifndef ROCKSDB_LITE
+  // may temporarily unlock and lock the mutex.
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif  // ROCKSDB_LITE
+
+  bool switched_to_mempurge = false;
+  // Within flush_job.Run, rocksdb may call event listener to notify
+  // file creation and deletion.
+  //
+  // Note that flush_job.Run will unlock and lock the db_mutex,
+  // and EventListener callback will be called when the db_mutex
+  // is unlocked by the current thread.
+  if (s.ok()) {
+    s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
+                      &switched_to_mempurge);
+    need_cancel = false;
+  }
+
+  if (!s.ok() && need_cancel) {
+    flush_job.Cancel();
+  }
+
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+                                       mutable_cf_options);
+    if (made_progress) {
+      *made_progress = true;
+    }
+
+    const std::string& column_family_name = cfd->GetName();
+
+    Version* const current = cfd->current();
+    assert(current);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    assert(storage_info);
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                     column_family_name.c_str(),
+                     storage_info->LevelSummary(&tmp));
+
+    const auto& blob_files = storage_info->GetBlobFiles();
+    if (!blob_files.empty()) {
+      assert(blob_files.front());
+      assert(blob_files.back());
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+          column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+          blob_files.back()->GetBlobFileNumber());
+    }
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+    if (log_io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the MANIFEST write errors will be mapped to soft errors.
+        // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
+        // needed.
+        error_handler_.SetBGError(s,
+                                  BackgroundErrorReason::kManifestWriteNoWAL);
+      } else {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the other SST file write errors will be set as
+        // kFlushNoWAL.
+        error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    } else {
+      assert(s == log_io_s);
+      Status new_bg_error = s;
+      error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+    }
+  }
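+  // Illustrative sketch (not part of this file): the SstFileManagerImpl
+  // consulted below is configured by the application; the space limit that
+  // IsMaxAllowedSpaceReached() enforces comes from user code such as:
+  //
+  //   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
+  //   sfm->SetMaxAllowedSpaceUsage(64ull << 30);  // 64 GB cap (example value)
+  //   options.sst_file_manager = sfm;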
+  // If flush ran smoothly and no mempurge happened,
+  // install the new SST file path.
+  if (s.ok() && (!switched_to_mempurge)) {
+#ifndef ROCKSDB_LITE
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushCompleted(cfd, mutable_cf_options,
+                           flush_job.GetCommittedFlushJobsInfo());
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm) {
+      // Notify sst_file_manager that a new file was added
+      std::string file_path = MakeTableFileName(
+          cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+      // TODO (PR7798). We should only add the file to the FileManager if it
+      // exists. Otherwise, some tests may fail. Ignore the error in the
+      // interim.
+      sfm->OnAddFile(file_path).PermitUncheckedError();
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        Status new_bg_error =
+            Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+            &new_bg_error);
+        error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+  return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg, 1>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  if (immutable_db_options_.atomic_flush) {
+    return AtomicFlushMemTablesToOutputFiles(
+        bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+  }
+  assert(bg_flush_args.size() == 1);
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  const auto& bg_flush_arg = bg_flush_args[0];
+  ColumnFamilyData* cfd = bg_flush_arg.cfd_;
+  // intentional infrequent copy for each flush
+  MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
+  SuperVersionContext* superversion_context =
+      bg_flush_arg.superversion_context_;
+  Status s = FlushMemTableToOutputFile(
+      cfd, mutable_cf_options_copy, made_progress, job_context,
+      superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+      snapshot_checker, log_buffer, thread_pri);
+  return s;
+}
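+// Illustrative sketch (not part of this file): atomic flush is enabled by
+// the application and then exercised through the multi-CF Flush() overload;
+// `db` and `handles` are assumed to be an open DB and its column family
+// handles:
+//
+//   Options options;
+//   options.atomic_flush = true;
+//   ...  // open the DB with several column families
+//   db->Flush(FlushOptions(), handles);  // flushed atomically as one unit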
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg, 1>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  autovector<ColumnFamilyData*> cfds;
+  for (const auto& arg : bg_flush_args) {
+    cfds.emplace_back(arg.cfd_);
+  }
+
+#ifndef NDEBUG
+  for (const auto cfd : cfds) {
+    assert(cfd->imm()->NumNotFlushed() != 0);
+    assert(cfd->imm()->IsFlushPending());
+    assert(cfd->GetFlushReason() == cfds[0]->GetFlushReason());
+  }
+#endif /* !NDEBUG */
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  autovector<FSDirectory*> distinct_output_dirs;
+  autovector<std::string> distinct_output_dir_paths;
+  std::vector<std::unique_ptr<FlushJob>> jobs;
+  std::vector<MutableCFOptions> all_mutable_cf_options;
+  int num_cfs = static_cast<int>(cfds.size());
+  all_mutable_cf_options.reserve(num_cfs);
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfd = cfds[i];
+    FSDirectory* data_dir = GetDataDir(cfd, 0U);
+    const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+    // Add to distinct output directories if eligible. Use linear search. Since
+    // the number of elements in the vector is not large, performance should be
+    // tolerable.
+    bool found = false;
+    for (const auto& path : distinct_output_dir_paths) {
+      if (path == curr_path) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      distinct_output_dir_paths.emplace_back(curr_path);
+      distinct_output_dirs.emplace_back(data_dir);
+    }
+
+    all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+    uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+    jobs.emplace_back(new FlushJob(
+        dbname_, cfd, immutable_db_options_, mutable_cf_options,
+        max_memtable_id, file_options_for_compaction_, versions_.get(),
+        &mutex_, &shutting_down_, snapshot_seqs,
+        earliest_write_conflict_snapshot, snapshot_checker, job_context,
+        log_buffer, directories_.GetDbDir(), data_dir,
+        GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+        &event_logger_, mutable_cf_options.report_bg_io_stats,
+        false /* sync_output_directory */, false /* write_manifest */,
+        thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+        cfd->GetFullHistoryTsLow(), &blob_callback_));
+  }
+
+  std::vector<FileMetaData> file_meta(num_cfs);
+  // Use of deque<bool> because vector<bool>
+  // is specific and doesn't allow &v[i].
+  std::deque<bool> switched_to_mempurge(num_cfs, false);
+  Status s;
+  IOStatus log_io_s = IOStatus::OK();
+  assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+  for (int i = 0; i != num_cfs; ++i) {
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+                       job_context->job_id);
+  }
+#endif /* !ROCKSDB_LITE */
+
+  if (logfile_number_ > 0) {
+    // TODO (yanqin) investigate whether we should sync the closed logs for
+    // single column family case.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      if (total_log_size_ > 0) {
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+      } else {
+        // If the WAL is empty, we use a different error reason
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    }
+  }
+  s = log_io_s;
+
+  // exec_status stores the execution status of flush_jobs as
+  // <bool /* executed */, Status /* status code */>
+  autovector<std::pair<bool, Status>> exec_status;
+  std::vector<bool> pick_status;
+  for (int i = 0; i != num_cfs; ++i) {
+    // Initially all jobs are not executed, with status OK.
+    exec_status.emplace_back(false, Status::OK());
+    pick_status.push_back(false);
+  }
+
+  if (s.ok()) {
+    for (int i = 0; i != num_cfs; ++i) {
+      jobs[i]->PickMemTable();
+      pick_status[i] = true;
+    }
+  }
+
+  if (s.ok()) {
+    assert(switched_to_mempurge.size() == static_cast<size_t>(num_cfs));
+    // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) { + exec_status[i].second = + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i], + &(switched_to_mempurge.at(i))); + exec_status[i].first = true; + } + if (num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1"); + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); + } + assert(exec_status.size() > 0); + assert(!file_meta.empty()); + exec_status[0].second = jobs[0]->Run( + &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */, + switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0))); + exec_status[0].first = true; + + Status error_status; + for (const auto& e : exec_status) { + if (!e.second.ok()) { + s = e.second; + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { + // If a flush job did not return OK, and the CF is not dropped, and + // the DB is not shutting down, then we have to return this result to + // caller later. + error_status = e.second; + } + } + } + + s = error_status.ok() ? s : error_status; + } + + if (s.IsColumnFamilyDropped()) { + s = Status::OK(); + } + + if (s.ok() || s.IsShutdownInProgress()) { + // Sync on all distinct output directories. + for (auto dir : distinct_output_dirs) { + if (dir != nullptr) { + Status error_status = dir->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + if (!error_status.ok()) { + s = error_status; + break; + } + } + } + } else { + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + // Have to cancel the flush jobs that have NOT executed because we need to + // unref the versions. + for (int i = 0; i != num_cfs; ++i) { + if (pick_status[i] && !exec_status[i].first) { + jobs[i]->Cancel(); + } + } + for (int i = 0; i != num_cfs; ++i) { + if (exec_status[i].second.ok() && exec_status[i].first) { + auto& mems = jobs[i]->GetMemTables(); + cfds[i]->imm()->RollbackMemtableFlush(mems, + file_meta[i].fd.GetNumber()); + } + } + } + + if (s.ok()) { + const auto wait_to_install_func = + [&]() -> std::pair { + if (!versions_->io_status().ok()) { + // Something went wrong elsewhere, we cannot count on waiting for our + // turn to write/sync to MANIFEST or CURRENT. Just return. + return std::make_pair(versions_->io_status(), false); + } else if (shutting_down_.load(std::memory_order_acquire)) { + return std::make_pair(Status::ShutdownInProgress(), false); + } + bool ready = true; + for (size_t i = 0; i != cfds.size(); ++i) { + const auto& mems = jobs[i]->GetMemTables(); + if (cfds[i]->IsDropped()) { + // If the column family is dropped, then do not wait. + continue; + } else if (!mems.empty() && + cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) { + // If a flush job needs to install the flush result for mems and + // mems[0] is not the earliest memtable, it means another thread must + // be installing flush results for the same column family, then the + // current thread needs to wait. + ready = false; + break; + } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <= + bg_flush_args[i].max_memtable_id_) { + // If a flush job does not need to install flush results, then it has + // to wait until all memtables up to max_memtable_id_ (inclusive) are + // installed. 
+ ready = false; + break; + } + } + return std::make_pair(Status::OK(), !ready); + }; + + bool resuming_from_bg_err = + error_handler_.IsDBStopped() || + (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery || + cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush); + while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) { + std::pair res = wait_to_install_func(); + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res); + + if (!res.first.ok()) { + s = res.first; + break; + } else if (!res.second) { + break; + } + atomic_flush_install_cv_.Wait(); + + resuming_from_bg_err = + error_handler_.IsDBStopped() || + (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery || + cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush); + } + + if (!resuming_from_bg_err) { + // If not resuming from bg err, then we determine future action based on + // whether we hit background error. + if (s.ok()) { + s = error_handler_.GetBGError(); + } + } else if (s.ok()) { + // If resuming from bg err, we still rely on wait_to_install_func()'s + // result to determine future action. If wait_to_install_func() returns + // non-ok already, then we should not proceed to flush result + // installation. + s = error_handler_.GetRecoveryError(); + } + } + + if (s.ok()) { + autovector tmp_cfds; + autovector*> mems_list; + autovector mutable_cf_options_list; + autovector tmp_file_meta; + autovector>*> + committed_flush_jobs_info; + for (int i = 0; i != num_cfs; ++i) { + const auto& mems = jobs[i]->GetMemTables(); + if (!cfds[i]->IsDropped() && !mems.empty()) { + tmp_cfds.emplace_back(cfds[i]); + mems_list.emplace_back(&mems); + mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); + tmp_file_meta.emplace_back(&file_meta[i]); +#ifndef ROCKSDB_LITE + committed_flush_jobs_info.emplace_back( + jobs[i]->GetCommittedFlushJobsInfo()); +#endif //! 
ROCKSDB_LITE + } + } + + s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, + versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta, + committed_flush_jobs_info, &job_context->memtables_to_free, + directories_.GetDbDir(), log_buffer); + } + + if (s.ok()) { + assert(num_cfs == + static_cast(job_context->superversion_contexts.size())); + for (int i = 0; i != num_cfs; ++i) { + assert(cfds[i]); + + if (cfds[i]->IsDropped()) { + continue; + } + InstallSuperVersionAndScheduleWork(cfds[i], + &job_context->superversion_contexts[i], + all_mutable_cf_options[i]); + + const std::string& column_family_name = cfds[i]->GetName(); + + Version* const current = cfds[i]->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", + column_family_name.c_str(), + storage_info->LevelSummary(&tmp)); + + const auto& blob_files = storage_info->GetBlobFiles(); + if (!blob_files.empty()) { + assert(blob_files.front()); + assert(blob_files.back()); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n", + column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(), + blob_files.back()->GetBlobFileNumber()); + } + } + if (made_progress) { + *made_progress = true; + } +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + assert(all_mutable_cf_options.size() == static_cast(num_cfs)); + for (int i = 0; s.ok() && i != num_cfs; ++i) { + // If mempurge happened instead of Flush, + // no NotifyOnFlushCompleted call (no SST file created). + if (switched_to_mempurge[i]) { + continue; + } + if (cfds[i]->IsDropped()) { + continue; + } + NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i], + jobs[i]->GetCommittedFlushJobsInfo()); + if (sfm) { + std::string file_path = MakeTableFileName( + cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); + if (sfm->IsMaxAllowedSpaceReached() && + error_handler_.GetBGError().ok()) { + Status new_bg_error = + Status::SpaceLimit("Max allowed space was reached"); + error_handler_.SetBGError(new_bg_error, + BackgroundErrorReason::kFlush); + } + } + } +#endif // ROCKSDB_LITE + } + + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + if (!s.ok() && !s.IsColumnFamilyDropped()) { + if (log_io_s.ok()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + if (!versions_->io_status().ok()) { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor + // is needed. 
+ error_handler_.SetBGError(s, + BackgroundErrorReason::kManifestWriteNoWAL); + } else { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL); + } + } else { + assert(s == log_io_s); + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } + + return s; +} + +void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, + const MutableCFOptions& mutable_cf_options, + int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_writes_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_writes_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + // release lock while notifying events + mutex_.Unlock(); + { + FlushJobInfo info{}; + info.cf_id = cfd->GetID(); + info.cf_name = cfd->GetName(); + // TODO(yhchiang): make db_paths dynamic in case flush does not + // go to L0 in the future. + const uint64_t file_number = file_meta->fd.GetNumber(); + info.file_path = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number); + info.file_number = file_number; + info.thread_id = env_->GetThreadID(); + info.job_id = job_id; + info.triggered_writes_slowdown = triggered_writes_slowdown; + info.triggered_writes_stop = triggered_writes_stop; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; + info.flush_reason = cfd->GetFlushReason(); + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushBegin(this, info); + } + } + mutex_.Lock(); +// no need to signal bg_cv_ as it will be signaled at the end of the +// flush process. +#else + (void)cfd; + (void)file_meta; + (void)mutable_cf_options; + (void)job_id; +#endif // ROCKSDB_LITE +} + +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info) { +#ifndef ROCKSDB_LITE + assert(flush_jobs_info != nullptr); + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_writes_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_writes_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + // release lock while notifying events + mutex_.Unlock(); + { + for (auto& info : *flush_jobs_info) { + info->triggered_writes_slowdown = triggered_writes_slowdown; + info->triggered_writes_stop = triggered_writes_stop; + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushCompleted(this, *info); + } + TEST_SYNC_POINT( + "DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted"); + } + flush_jobs_info->clear(); + } + mutex_.Lock(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
+#else
+  (void)cfd;
+  (void)mutable_cf_options;
+  (void)flush_jobs_info;
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice* begin_without_ts,
+                            const Slice* end_without_ts) {
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz == 0) {
+    return CompactRangeInternal(options, column_family, begin_without_ts,
+                                end_without_ts, "" /*trim_ts*/);
+  }
+
+  std::string begin_str;
+  std::string end_str;
+
+  // CompactRange compacts all keys in the range [begin, end] inclusively. Add
+  // the maximum timestamp to include all `begin` keys, and add the minimum
+  // timestamp to include all `end` keys.
+  if (begin_without_ts != nullptr) {
+    AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
+  }
+  if (end_without_ts != nullptr) {
+    AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
+  }
+  Slice begin(begin_str);
+  Slice end(end_str);
+
+  Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
+  Slice* end_with_ts = end_without_ts ? &end : nullptr;
+
+  return CompactRangeInternal(options, column_family, begin_with_ts,
+                              end_with_ts, "" /*trim_ts*/);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                        std::string ts_low) {
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  if (cfd->user_comparator()->timestamp_size() != ts_low.size()) {
+    return Status::InvalidArgument("ts_low size mismatch");
+  }
+  return IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+                                            std::string ts_low) {
+  VersionEdit edit;
+  edit.SetColumnFamily(cfd->GetID());
+  edit.SetFullHistoryTsLow(ts_low);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+                           &edit);
+
+  InstrumentedMutexLock l(&mutex_);
+  std::string current_ts_low = cfd->GetFullHistoryTsLow();
+  const Comparator* ucmp = cfd->user_comparator();
+  assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
+    return Status::InvalidArgument("Cannot decrease full_history_ts_low");
+  }
+
+  Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+  if (!s.ok()) {
+    return s;
+  }
+  current_ts_low = cfd->GetFullHistoryTsLow();
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
+    std::stringstream oss;
+    oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
+        << " is set to be higher than the requested "
+           "timestamp: "
+        << Slice(ts_low).ToString(true) << std::endl;
+    return Status::TryAgain(oss.str());
+  }
+  return Status::OK();
+}
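+// Illustrative sketch (not part of this file): how an application typically
+// drives the two entry points above on a timestamp-enabled column family;
+// `db` and `cfh` are assumed, and the 8-byte encoding below is an example
+// matching a u64-timestamp comparator:
+//
+//   std::string ts_low(8, '\0');  // encoded low timestamp (example value)
+//   db->IncreaseFullHistoryTsLow(cfh, ts_low);  // GC history below ts_low
+//   Slice begin("a"), end("z");   // user keys without timestamps
+//   db->CompactRange(CompactRangeOptions(), cfh, &begin, &end);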
+                                    ColumnFamilyHandle* column_family,
+                                    const Slice* begin, const Slice* end,
+                                    const std::string& trim_ts) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+    return Status::InvalidArgument("Invalid target path ID");
+  }
+
+  bool flush_needed = true;
+
+  // Update full_history_ts_low if it's set
+  if (options.full_history_ts_low != nullptr &&
+      !options.full_history_ts_low->empty()) {
+    std::string ts_low = options.full_history_ts_low->ToString();
+    if (begin != nullptr || end != nullptr) {
+      return Status::InvalidArgument(
+          "Cannot specify compaction range with full_history_ts_low");
+    }
+    Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  Status s;
+  if (begin != nullptr && end != nullptr) {
+    // TODO(ajkr): We could also optimize away the flush in certain cases where
+    // one/both sides of the interval are unbounded. But it requires more
+    // changes to RangesOverlapWithMemtables.
+    Range range(*begin, *end);
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    s = cfd->RangesOverlapWithMemtables(
+        {range}, super_version, immutable_db_options_.allow_data_in_errors,
+        &flush_needed);
+    CleanupSuperVersion(super_version);
+  }
+
+  if (s.ok() && flush_needed) {
+    FlushOptions fo;
+    fo.allow_write_stall = options.allow_write_stall;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      mutex_.Lock();
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+                               false /* entered_write_thread */);
+    } else {
+      s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+                        false /* entered_write_thread */);
+    }
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  constexpr int kInvalidLevel = -1;
+  int final_output_level = kInvalidLevel;
+  bool exclusive = options.exclusive_manual_compaction;
+  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+      cfd->NumberLevels() > 1) {
+    // Always compact all files together.
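+    // For example (illustrative): with num_levels = 7, the manual compaction
+    // outputs to level 6; with allow_ingest_behind set, level 6 is reserved
+    // for ingested files, so the output level becomes 5 instead.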
+    final_output_level = cfd->NumberLevels() - 1;
+    // if bottom most level is reserved
+    if (immutable_db_options_.allow_ingest_behind) {
+      final_output_level--;
+    }
+    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+                            final_output_level, options, begin, end, exclusive,
+                            false, std::numeric_limits<uint64_t>::max(),
+                            trim_ts);
+  } else {
+    int first_overlapped_level = kInvalidLevel;
+    int max_overlapped_level = kInvalidLevel;
+    {
+      SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+      Version* current_version = super_version->current;
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      bool overlap;
+      for (int level = 0;
+           level < current_version->storage_info()->num_non_empty_levels();
+           level++) {
+        overlap = true;
+        if (begin != nullptr && end != nullptr) {
+          Status status = current_version->OverlapWithLevelIterator(
+              ro, file_options_, *begin, *end, level, &overlap);
+          if (!status.ok()) {
+            overlap = current_version->storage_info()->OverlapInLevel(
+                level, begin, end);
+          }
+        } else {
+          overlap = current_version->storage_info()->OverlapInLevel(
+              level, begin, end);
+        }
+        if (overlap) {
+          if (first_overlapped_level == kInvalidLevel) {
+            first_overlapped_level = level;
+          }
+          max_overlapped_level = level;
+        }
+      }
+      CleanupSuperVersion(super_version);
+    }
+    if (s.ok() && first_overlapped_level != kInvalidLevel) {
+      // max_file_num_to_ignore can be used to filter out newly created SST
+      // files, useful for bottom level compaction in a manual compaction
+      uint64_t max_file_num_to_ignore = std::numeric_limits<uint64_t>::max();
+      uint64_t next_file_number = versions_->current_next_file_number();
+      final_output_level = max_overlapped_level;
+      int output_level;
+      for (int level = first_overlapped_level; level <= max_overlapped_level;
+           level++) {
+        bool disallow_trivial_move = false;
+        // in case the compaction is universal or if we're compacting the
+        // bottom-most level, the output level will be the same as input one.
+        // level 0 can never be the bottommost level (i.e. if all files are in
+        // level 0, we will compact to level 1)
+        if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+            cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+          output_level = level;
+        } else if (level == max_overlapped_level && level > 0) {
+          if (options.bottommost_level_compaction ==
+              BottommostLevelCompaction::kSkip) {
+            // Skip bottommost level compaction
+            continue;
+          } else if (options.bottommost_level_compaction ==
+                         BottommostLevelCompaction::kIfHaveCompactionFilter &&
+                     cfd->ioptions()->compaction_filter == nullptr &&
+                     cfd->ioptions()->compaction_filter_factory == nullptr) {
+            // Skip bottommost level compaction since we don't have a
+            // compaction filter
+            continue;
+          }
+          output_level = level;
+          // update max_file_num_to_ignore only for bottom level compaction
+          // because data in newly compacted files in middle levels may still
+          // need to be pushed down
+          max_file_num_to_ignore = next_file_number;
+        } else {
+          output_level = level + 1;
+          if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+              cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+              level == 0) {
+            output_level = ColumnFamilyData::kCompactToBaseLevel;
+          }
+          // if it's a BottommostLevel compaction and `kForce*` compaction is
+          // set, disallow trivial move
+          if (level == max_overlapped_level &&
+              (options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kForce ||
+               options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kForceOptimized)) {
+            disallow_trivial_move = true;
+          }
+        }
+        // trim_ts needs a real compaction to remove the latest record
+        if (!trim_ts.empty()) {
+          disallow_trivial_move = true;
+        }
+        s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+                                exclusive, disallow_trivial_move,
+                                max_file_num_to_ignore, trim_ts);
+        if (!s.ok()) {
+          break;
+        }
+        if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+          final_output_level = cfd->NumberLevels() - 1;
+        } else if (output_level > final_output_level) {
+          final_output_level = output_level;
+        }
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+      }
+    }
+  }
+  if (!s.ok() || final_output_level == kInvalidLevel) {
+    LogFlush(immutable_db_options_.info_log);
+    return s;
+  }
+
+  if (options.change_level) {
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1");
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2");
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[RefitLevel] waiting for background threads to stop");
+    DisableManualCompaction();
+    s = PauseBackgroundWork();
+    if (s.ok()) {
+      TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel");
+      s = ReFitLevel(cfd, final_output_level, options.target_level);
+      TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel");
+      // ContinueBackgroundWork always returns Status::OK().
+      Status temp_s = ContinueBackgroundWork();
+      assert(temp_s.ok());
+    }
+    EnableManualCompaction();
+    TEST_SYNC_POINT(
+        "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled");
+  }
+  LogFlush(immutable_db_options_.info_log);
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // An automatic compaction that has been scheduled might have been
+    // preempted by the manual compactions. Need to schedule it back.
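+    // Rescheduling must happen while mutex_ is held:
+    // MaybeScheduleFlushOrCompaction() re-examines the pending flush and
+    // compaction queues and the background job limits, all of which are
+    // protected by mutex_.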
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+                            ColumnFamilyHandle* column_family,
+                            const std::vector<std::string>& input_file_names,
+                            const int output_level, const int output_path_id,
+                            std::vector<std::string>* const output_file_names,
+                            CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+  (void)compact_options;
+  (void)column_family;
+  (void)input_file_names;
+  (void)output_level;
+  (void)output_path_id;
+  (void)output_file_names;
+  (void)compaction_job_info;
+  // not supported in lite version
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (column_family == nullptr) {
+    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+  }
+
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  assert(cfd);
+
+  Status s;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  // Perform CompactFiles
+  TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+  TEST_SYNC_POINT_CALLBACK(
+      "TestCompactFiles:PausingManualCompaction:3",
+      reinterpret_cast<void*>(
+          const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    // We need to get current after `WaitForIngestFile`, because
+    // `IngestExternalFile` may add files that overlap with `input_file_names`
+    auto* current = cfd->current();
+    current->Ref();
+
+    s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+                         output_file_names, output_level, output_path_id,
+                         &job_context, &log_buffer, compaction_job_info);
+
+    current->Unref();
+  }
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }  // release the mutex
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToClean() ||
+      job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here. No need to Unlock() and Lock() here.
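+      // PurgeObsoleteFiles() only deletes files that FindObsoleteFiles()
+      // above already collected into job_context, so it can safely run
+      // without holding mutex_.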
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
+
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    std::vector<std::string>* const output_file_names, const int output_level,
+    int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+    CompactionJobInfo* compaction_job_info) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following function calls is pluggable to external developers.
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  if (output_path_id < 0) {
+    if (cfd->ioptions()->cf_paths.size() == 1U) {
+      output_path_id = 0;
+    } else {
+      return Status::NotSupported(
+          "Automatic output path selection is not "
+          "yet supported in CompactFiles()");
+    }
+  }
+
+  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+      &input_set, cf_meta, output_level);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<CompactionInputFiles> input_files;
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, version->storage_info(), compact_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& inputs : input_files) {
+    if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+      return Status::Aborted(
+          "Some of the necessary compaction input "
+          "files are already being compacted");
+    }
+  }
+  bool sfm_reserved_compact_space = false;
+  // First check if we have enough room to do the compaction
+  bool enough_room = EnoughRoomForCompaction(
+      cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+  if (!enough_room) {
+    // m's vars will get set properly at the end of this function,
+    // as long as status == CompactionTooLarge
+    return Status::CompactionTooLarge();
+  }
+
+  // At this point, CompactFiles will be run.
+  bg_compaction_scheduled_++;
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      compact_options, input_files, output_level, version->storage_info(),
+      *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
+  // we already sanitized the set of input files and checked for conflicts
+  // without releasing the lock, so we're guaranteed a compaction can be formed.
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+  // deletion compaction currently not allowed in CompactFiles.
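+  // (A deletion compaction drops entire input files without rewriting them,
+  // as FIFO compaction does when evicting old files.)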
+  assert(!c->deletion_compaction());
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+
+  assert(is_snapshot_supported_ || snapshots_.empty());
+  CompactionJobStats compaction_job_stats;
+  CompactionJob compaction_job(
+      job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      log_buffer, directories_.GetDbDir(),
+      GetDataDir(c->column_family_data(), c->output_path_id()),
+      GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, table_cache_, &event_logger_,
+      c->mutable_cf_options()->paranoid_file_checks,
+      c->mutable_cf_options()->report_bg_io_stats, dbname_,
+      &compaction_job_stats, Env::Priority::USER, io_tracer_,
+      kManualCompactionCanceledFalse_, db_id_, db_session_id_,
+      c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+      &blob_callback_, &bg_compaction_scheduled_,
+      &bg_bottom_compaction_scheduled_);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here.
+  version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+                                                  *c->mutable_cf_options());
+
+  compaction_job.Prepare();
+
+  mutex_.Unlock();
+  TEST_SYNC_POINT("CompactFilesImpl:0");
+  TEST_SYNC_POINT("CompactFilesImpl:1");
+  // Ignore the status here, as it will be checked in the Install down below...
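+  // PermitUncheckedError() only marks the returned Status as inspected so
+  // that the debug-build unchecked-status assertion does not fire; failures
+  // still surface through compaction_job.Install() below.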
+  compaction_job.Run().PermitUncheckedError();
+  TEST_SYNC_POINT("CompactFilesImpl:2");
+  TEST_SYNC_POINT("CompactFilesImpl:3");
+  mutex_.Lock();
+
+  Status status = compaction_job.Install(*c->mutable_cf_options());
+  if (status.ok()) {
+    assert(compaction_job.io_status().ok());
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+  }
+  // status above captures any error during compaction_job.Install, so it's ok
+  // not to check compaction_job.io_status() explicitly if we're not calling
+  // SetBGError
+  compaction_job.io_status().PermitUncheckedError();
+  c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+  // Need to make sure SstFileManager does its bookkeeping
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm && sfm_reserved_compact_space) {
+    sfm->OnCompactionCompletion(c.get());
+  }
+#endif  // ROCKSDB_LITE
+
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  if (compaction_job_info != nullptr) {
+    BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+                           job_context->job_id, version, compaction_job_info);
+  }
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else if (status.IsManualCompactionPaused()) {
+    // Don't report stopping manual compaction as error
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Stopping manual compaction",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id);
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Compaction error: %s",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id, status.ToString().c_str());
+    IOStatus io_s = compaction_job.io_status();
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+  }
+
+  if (output_file_names != nullptr) {
+    for (const auto& newf : c->edit()->GetNewFiles()) {
+      output_file_names->push_back(TableFileName(
+          c->immutable_options()->cf_paths, newf.second.fd.GetNumber(),
+          newf.second.fd.GetPathId()));
+    }
+
+    for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+      output_file_names->push_back(
+          BlobFileName(c->immutable_options()->cf_paths.front().path,
+                       blob_file.GetBlobFileNumber()));
+    }
+  }
+
+  c.reset();
+
+  bg_compaction_scheduled_--;
+  if (bg_compaction_scheduled_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  MaybeScheduleFlushOrCompaction();
+  TEST_SYNC_POINT("CompactFilesImpl:End");
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  bg_compaction_paused_++;
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         bg_flush_scheduled_ > 0) {
+    bg_cv_.Wait();
+  }
+  bg_work_paused_++;
+  return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  if (bg_work_paused_ == 0) {
+    return Status::InvalidArgument();
+  }
+  assert(bg_work_paused_ > 0);
+  assert(bg_compaction_paused_ > 0);
+  bg_compaction_paused_--;
+  bg_work_paused_--;
+  // It's sufficient to check just bg_work_paused_ here since
+  // bg_work_paused_ is always no greater than bg_compaction_paused_
+  if (bg_work_paused_ == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                                     const Status& st,
+                                     const CompactionJobStats& job_stats,
+                                     int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return;
+  }
+
+  c->SetNotifyOnCompactionCompleted();
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionBegin(this, info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  mutex_.Lock();
+  current->Unref();
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+    ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  if (c->ShouldNotifyOnCompactionCompleted() == false) {
+    return;
+  }
+
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+                           &info);
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionCompleted(this, info);
+    }
+  }
+  mutex_.Lock();
+  current->Unref();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // compaction process.
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)compaction_job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+  assert(level < cfd->NumberLevels());
+  if (target_level >= cfd->NumberLevels()) {
+    return Status::InvalidArgument("Target level exceeds number of levels");
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+
+  InstrumentedMutexLock guard_lock(&mutex_);
+
+  // only allow one thread refitting
+  if (refitting_level_) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[ReFitLevel] another thread is refitting");
+    return Status::NotSupported("another thread is refitting");
+  }
+  refitting_level_ = true;
+
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+  // move to a smaller level
+  int to_level = target_level;
+  if (target_level < 0) {
+    to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+  }
+
+  auto* vstorage = cfd->current()->storage_info();
+  if (to_level != level) {
+    if (to_level > level) {
+      if (level == 0) {
+        refitting_level_ = false;
+        return Status::NotSupported(
+            "Cannot change from level 0 to other levels.");
+      }
+      // Check levels are empty for a trivial move
+      for (int l = level + 1; l <= to_level; l++) {
+        if (vstorage->NumLevelFiles(l) > 0) {
+          refitting_level_ = false;
+          return Status::NotSupported(
+              "Levels between source and target are not empty for a move.");
+        }
+      }
+    } else {
+      // to_level < level
+      // Check levels are empty for a trivial move
+      for (int l = to_level; l < level; l++) {
+        if (vstorage->NumLevelFiles(l) > 0) {
+          refitting_level_ = false;
+          return Status::NotSupported(
+              "Levels between source and target are not empty for a move.");
+        }
+      }
+    }
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+                    cfd->current()->DebugString().data());
+
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : vstorage->LevelFiles(level)) {
+      edit.DeleteFile(level, f->fd.GetNumber());
+      edit.AddFile(
+          to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
+          f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
+          f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
+          f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
+          f->file_checksum_func_name, f->unique_id);
+    }
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+                    edit.DebugString().data());
+
+    Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit,
+                                           &mutex_, directories_.GetDbDir());
+
+    InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+                    cfd->GetName().c_str(), status.ToString().data());
+
+    if (status.ok()) {
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+                      cfd->current()->DebugString().data());
+    }
+    sv_context.Clean();
+    refitting_level_ = false;
+
+    return status;
+  }
+
+  refitting_level_ = false;
+  return Status::OK();
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+  return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  InstrumentedMutexLock l(&mutex_);
+  return cfh->cfd()
+      ->GetSuperVersion()
+      ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+                 cfh->GetName().c_str());
+  Status s;
+  if (immutable_db_options_.atomic_flush) {
+    s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+                             FlushReason::kManualFlush);
+  } else {
+    s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] Manual flush finished, status: %s\n",
+                 cfh->GetName().c_str(), s.ToString().c_str());
+  return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     const std::vector<ColumnFamilyHandle*>& column_families) {
+  Status s;
+  if (!immutable_db_options_.atomic_flush) {
+    for (auto cfh : column_families) {
+      s = Flush(flush_options, cfh);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush start.\n"
+                   "=====Column families:=====");
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+    autovector<ColumnFamilyData*> cfds;
+    std::for_each(column_families.begin(), column_families.end(),
+                  [&cfds](ColumnFamilyHandle* elem) {
+                    auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+                    cfds.emplace_back(cfh->cfd());
+                  });
+    s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush finished, status: %s\n"
+                   "=====Column families:=====",
+                   s.ToString().c_str());
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+  }
+  return s;
+}
+
+Status DBImpl::RunManualCompaction(
+    ColumnFamilyData* cfd, int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options, const Slice* begin,
+    const Slice* end, bool exclusive, bool disallow_trivial_move,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+         input_level >= 0);
+
+  InternalKey begin_storage, end_storage;
+  CompactionArg* ca = nullptr;
+
+  bool scheduled = false;
+  bool unscheduled = false;
+  Env::Priority thread_pool_priority = Env::Priority::TOTAL;
+  bool manual_conflict = false;
+
+  ManualCompactionState manual(
+      cfd, input_level, output_level, compact_range_options.target_path_id,
+      exclusive, disallow_trivial_move, compact_range_options.canceled);
+  // For universal compaction, we enforce every manual compaction to compact
+  // all files.
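+  // Hence the user-supplied `begin`/`end` are deliberately ignored below for
+  // the universal and FIFO styles: e.g. CompactRange(options, &start, &end)
+  // on a universal-compaction column family still compacts every file.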
+  if (begin == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.begin = nullptr;
+  } else {
+    begin_storage.SetMinPossibleForUserKey(*begin);
+    manual.begin = &begin_storage;
+  }
+  if (end == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.end = nullptr;
+  } else {
+    end_storage.SetMaxPossibleForUserKey(*end);
+    manual.end = &end_storage;
+  }
+
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+  InstrumentedMutexLock l(&mutex_);
+
+  if (manual_compaction_paused_ > 0) {
+    // Does not make sense to `AddManualCompaction()` in this scenario since
+    // `DisableManualCompaction()` just waited for the manual compaction queue
+    // to drain. So return immediately.
+    TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart");
+    manual.status =
+        Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    manual.done = true;
+    return manual.status;
+  }
+
+  // When a manual compaction arrives, temporarily disable scheduling of
+  // non-manual compactions and wait until the number of scheduled compaction
+  // jobs drops to zero. This used to be needed to ensure that this manual
+  // compaction can compact any range of keys/files. Now it is optional
+  // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for
+  // `exclusive_manual_compaction=true` is unclear beyond not trusting the code.
+  //
+  // HasPendingManualCompaction() is true when at least one thread is inside
+  // RunManualCompaction(), i.e. during that time no other compaction will
+  // get scheduled (see MaybeScheduleFlushOrCompaction).
+  //
+  // Note that the following loop doesn't stop more than one thread calling
+  // RunManualCompaction() from getting to the second while loop below.
+  // However, only one of them will actually schedule compaction, while
+  // others will wait on a condition variable until it completes.
+
+  AddManualCompaction(&manual);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+  if (exclusive) {
+    // Limitation: there's no way to wake up the below loop when user sets
+    // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction`
+    // and `CompactRangeOptions::canceled` might not work well together.
+    while (bg_bottom_compaction_scheduled_ > 0 ||
+           bg_compaction_scheduled_ > 0) {
+      if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+        // Pretend the error came from compaction so the below cleanup/error
+        // handling code can process it.
+        manual.done = true;
+        manual.status =
+            Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+        break;
+      }
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "[%s] Manual compaction waiting for all other scheduled background "
+          "compactions to finish",
+          cfd->GetName().c_str());
+      bg_cv_.Wait();
+    }
+  }
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting",
+                   cfd->GetName().c_str());
+
+  // We don't check bg_error_ here, because if we get the error in compaction,
+  // the compaction will set manual.status to bg_error_ and set manual.done to
+  // true.
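+  // Each iteration below either (a) waits on bg_cv_ because the compaction
+  // cannot be picked yet (conflict, pause, or one is already scheduled), or
+  // (b) picks a compaction via CompactRange() and schedules it on a
+  // background thread; manual.done is set once the range is fully compacted.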
+  while (!manual.done) {
+    assert(HasPendingManualCompaction());
+    manual_conflict = false;
+    Compaction* compaction = nullptr;
+    if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+        scheduled ||
+        (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+         ((compaction = manual.cfd->CompactRange(
+               *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+               manual.input_level, manual.output_level, compact_range_options,
+               manual.begin, manual.end, &manual.manual_end, &manual_conflict,
+               max_file_num_to_ignore, trim_ts)) == nullptr &&
+          manual_conflict))) {
+      // exclusive manual compactions should not see a conflict during
+      // CompactRange
+      assert(!exclusive || !manual_conflict);
+      // Running either this or some other manual compaction
+      bg_cv_.Wait();
+      if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
+        assert(thread_pool_priority != Env::Priority::TOTAL);
+        // unschedule all manual compactions
+        auto unscheduled_task_num = env_->UnSchedule(
+            GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
+        if (unscheduled_task_num > 0) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[%s] Unscheduled %d manual compactions from the thread-pool",
+              cfd->GetName().c_str(), unscheduled_task_num);
+          // it may unschedule other manual compactions, notify others.
+          bg_cv_.SignalAll();
+        }
+        unscheduled = true;
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
+      }
+      if (scheduled && manual.incomplete == true) {
+        assert(!manual.in_progress);
+        scheduled = false;
+        manual.incomplete = false;
+      }
+    } else if (!scheduled) {
+      if (compaction == nullptr) {
+        manual.done = true;
+        bg_cv_.SignalAll();
+        continue;
+      }
+      ca = new CompactionArg;
+      ca->db = this;
+      ca->prepicked_compaction = new PrepickedCompaction;
+      ca->prepicked_compaction->manual_compaction_state = &manual;
+      ca->prepicked_compaction->compaction = compaction;
+      if (!RequestCompactionToken(
+              cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+        // Don't throttle manual compaction, only count outstanding tasks.
+        assert(false);
+      }
+      manual.incomplete = false;
+      if (compaction->bottommost_level() &&
+          env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+        bg_bottom_compaction_scheduled_++;
+        ca->compaction_pri_ = Env::Priority::BOTTOM;
+        env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
+                       Env::Priority::BOTTOM,
+                       GetTaskTag(TaskType::kManualCompaction),
+                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::BOTTOM;
+      } else {
+        bg_compaction_scheduled_++;
+        ca->compaction_pri_ = Env::Priority::LOW;
+        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+                       GetTaskTag(TaskType::kManualCompaction),
+                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::LOW;
+      }
+      scheduled = true;
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+    }
+  }
+
+  log_buffer.FlushBufferToLog();
+  assert(!manual.in_progress);
+  assert(HasPendingManualCompaction());
+  RemoveManualCompaction(&manual);
+  // if the manual job is unscheduled, try to schedule other jobs in case
+  // there's any unscheduled compaction job which was blocked by exclusive
+  // manual compaction.
+  if (manual.status.IsIncomplete() &&
+      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  bg_cv_.SignalAll();
+  return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                                  FlushRequest* req) {
+  assert(req != nullptr);
+  req->reserve(cfds.size());
+  for (const auto cfd : cfds) {
+    if (nullptr == cfd) {
+      // cfd may be null, see DBImpl::ScheduleFlushes
+      continue;
+    }
+    uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+    req->emplace_back(cfd, max_memtable_id);
+  }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+                             const FlushOptions& flush_options,
+                             FlushReason flush_reason,
+                             bool entered_write_thread) {
+  // This method should not be called if atomic_flush is true.
+  assert(!immutable_db_options_.atomic_flush);
+  if (!flush_options.wait && write_controller_.IsStopped()) {
+    std::ostringstream oss;
+    oss << "Writes have been stopped, thus unable to perform manual flush. "
+           "Please try again later after writes are resumed";
+    return Status::TryAgain(oss.str());
+  }
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    bool flush_needed = true;
+    s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+    TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+    if (!s.ok() || !flush_needed) {
+      return s;
+    }
+  }
+
+  const bool needs_to_join_write_thread = !entered_write_thread;
+  autovector<FlushRequest> flush_reqs;
+  autovector<uint64_t> memtable_ids_to_wait;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (needs_to_join_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
+        (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
+      // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
+      // auto retry resume, we want to avoid creating new small memtables.
+      // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
+      // will iterate through all the CFs and call FlushMemtable during auto
+      // retry resume, it is possible that in some CFs,
+      // cfd->imm()->NumNotFlushed() == 0. In that case, no flush request will
+      // be created or scheduled, and Status::OK() will be returned.
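+      // SwitchMemtable seals the active memtable into the immutable list
+      // (cfd->imm()) and installs a fresh empty one, so the sealed data can
+      // be picked up by the flush request scheduled below.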
+      s = SwitchMemtable(cfd, &context);
+    }
+    const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
+    if (s.ok()) {
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        FlushRequest req{{cfd, flush_memtable_id}};
+        flush_reqs.emplace_back(std::move(req));
+        memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID());
+      }
+      if (immutable_db_options_.persist_stats_to_disk &&
+          flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
+        ColumnFamilyData* cfd_stats =
+            versions_->GetColumnFamilySet()->GetColumnFamily(
+                kPersistentStatsColumnFamilyName);
+        if (cfd_stats != nullptr && cfd_stats != cfd &&
+            !cfd_stats->mem()->IsEmpty()) {
+          // only force flush stats CF when it will be the only CF lagging
+          // behind after the current flush
+          bool stats_cf_flush_needed = true;
+          for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+            if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+              continue;
+            }
+            if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+              stats_cf_flush_needed = false;
+            }
+          }
+          if (stats_cf_flush_needed) {
+            ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                           "Force flushing stats CF with manual flush of %s "
+                           "to avoid holding old logs",
+                           cfd->GetName().c_str());
+            s = SwitchMemtable(cfd_stats, &context);
+            FlushRequest req{{cfd_stats, flush_memtable_id}};
+            flush_reqs.emplace_back(std::move(req));
+            memtable_ids_to_wait.emplace_back(
+                cfd->imm()->GetLatestMemTableID());
+          }
+        }
+      }
+    }
+
+    if (s.ok() && !flush_reqs.empty()) {
+      for (const auto& req : flush_reqs) {
+        assert(req.size() == 1);
+        ColumnFamilyData* loop_cfd = req[0].first;
+        loop_cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (const auto& req : flush_reqs) {
+          assert(req.size() == 1);
+          ColumnFamilyData* loop_cfd = req[0].first;
+          loop_cfd->Ref();
+        }
+      }
+      for (const auto& req : flush_reqs) {
+        SchedulePendingFlush(req, flush_reason);
+      }
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (needs_to_join_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<ColumnFamilyData*> cfds;
+    autovector<const uint64_t*> flush_memtable_ids;
+    assert(flush_reqs.size() == memtable_ids_to_wait.size());
+    for (size_t i = 0; i < flush_reqs.size(); ++i) {
+      assert(flush_reqs[i].size() == 1);
+      cfds.push_back(flush_reqs[i][0].first);
+      flush_memtable_ids.push_back(&(memtable_ids_to_wait[i]));
+    }
+    s = WaitForFlushMemTables(
+        cfds, flush_memtable_ids,
+        (flush_reason == FlushReason::kErrorRecovery ||
+         flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* tmp_cfd : cfds) {
+      tmp_cfd->UnrefAndTryDelete();
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+  return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
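+// For example (illustrative), an atomic flush of two column families might be
+// requested as:
+//   AtomicFlushMemTables({cfd_a, cfd_b}, FlushOptions(),
+//                        FlushReason::kManualFlush);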
+Status DBImpl::AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const FlushOptions& flush_options, FlushReason flush_reason,
+    bool entered_write_thread) {
+  assert(immutable_db_options_.atomic_flush);
+  if (!flush_options.wait && write_controller_.IsStopped()) {
+    std::ostringstream oss;
+    oss << "Writes have been stopped, thus unable to perform manual flush. "
+           "Please try again later after writes are resumed";
+    return Status::TryAgain(oss.str());
+  }
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    int num_cfs_to_flush = 0;
+    for (auto cfd : column_family_datas) {
+      bool flush_needed = true;
+      s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+      if (!s.ok()) {
+        return s;
+      } else if (flush_needed) {
+        ++num_cfs_to_flush;
+      }
+    }
+    if (0 == num_cfs_to_flush) {
+      return s;
+    }
+  }
+  const bool needs_to_join_write_thread = !entered_write_thread;
+  FlushRequest flush_req;
+  autovector<ColumnFamilyData*> cfds;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (needs_to_join_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    for (auto cfd : column_family_datas) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        cfds.emplace_back(cfd);
+      }
+    }
+    for (auto cfd : cfds) {
+      if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
+          flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
+        continue;
+      }
+      cfd->Ref();
+      s = SwitchMemtable(cfd, &context);
+      cfd->UnrefAndTryDelete();
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (s.ok()) {
+      AssignAtomicFlushSeq(cfds);
+      for (auto cfd : cfds) {
+        cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (auto cfd : cfds) {
+          cfd->Ref();
+        }
+      }
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, flush_reason);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (needs_to_join_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req) {
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(
+        cfds, flush_memtable_ids,
+        (flush_reason == FlushReason::kErrorRecovery ||
+         flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* cfd : cfds) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is being flushed already.
+// This method tries to avoid write stalls (similar to CompactRange() behavior):
+// it emulates how the SuperVersion / LSM would change if the flush happened,
+// checks it against various constraints, and delays the flush if it would
+// cause a write stall.
+// Caller should check status and flush_needed to see if flush already happened. +Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed) { + { + *flush_needed = true; + InstrumentedMutexLock l(&mutex_); + uint64_t orig_active_memtable_id = cfd->mem()->GetID(); + WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; + do { + if (write_stall_condition != WriteStallCondition::kNormal) { + // Same error handling as user writes: Don't wait if there's a + // background error, even if it's a soft error. We might wait here + // indefinitely as the pending flushes/compactions may never finish + // successfully, resulting in the stall condition lasting indefinitely + if (error_handler_.IsBGWorkStopped()) { + return error_handler_.GetBGError(); + } + + TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] WaitUntilFlushWouldNotStallWrites" + " waiting on stall conditions to clear", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + uint64_t earliest_memtable_id = + std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); + if (earliest_memtable_id > orig_active_memtable_id) { + // We waited so long that the memtable we were originally waiting on was + // flushed. + *flush_needed = false; + return Status::OK(); + } + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + const auto* vstorage = cfd->current()->storage_info(); + + // Skip stalling check if we're below auto-flush and auto-compaction + // triggers. If it stalled in these conditions, that'd mean the stall + // triggers are so low that stalling is needed for any background work. In + // that case we shouldn't wait since background work won't be scheduled. + if (cfd->imm()->NumNotFlushed() < + cfd->ioptions()->min_write_buffer_number_to_merge && + vstorage->l0_delay_trigger_count() < + mutable_cf_options.level0_file_num_compaction_trigger) { + break; + } + + // check whether one extra immutable memtable or an extra L0 file would + // cause write stalling mode to be entered. It could still enter stall + // mode due to pending compaction bytes, but that's less common + write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), + mutable_cf_options, *cfd->ioptions()) + .first; + } while (write_stall_condition != WriteStallCondition::kNormal); + } + return Status::OK(); +} + +// Wait for memtables to be flushed for multiple column families. +// let N = cfds.size() +// for i in [0, N), +// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs +// have to be flushed for THIS column family; +// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column +// family have to be flushed. +// Finish waiting when ALL column families finish flushing memtables. +// resuming_from_bg_err indicates whether the caller is trying to resume from +// background error or in normal processing. 
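+// For example (illustrative): with cfds = {cf1, cf2} and flush_memtable_ids =
+// {&id5, nullptr}, waiting ends once cf1 has flushed all memtables with IDs
+// <= 5 and cf2 has flushed all of its memtables (or once either column family
+// is dropped).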
+Status DBImpl::WaitForFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const uint64_t*>& flush_memtable_ids,
+    bool resuming_from_bg_err) {
+  int num = static_cast<int>(cfds.size());
+  // Wait until the flush completes
+  InstrumentedMutexLock l(&mutex_);
+  Status s;
+  // If the caller is trying to resume from bg error, then
+  // error_handler_.IsDBStopped() is true.
+  while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      s = Status::ShutdownInProgress();
+      return s;
+    }
+    // If an error has occurred during resumption, then no need to wait.
+    // But the flush operation may fail because of this error, so we need to
+    // return the status.
+    if (!error_handler_.GetRecoveryError().ok()) {
+      s = error_handler_.GetRecoveryError();
+      break;
+    }
+    // If BG work is stopped, there is a BG error and either 1) it is a soft
+    // error that requires no BG work, or 2) we are not in auto recovery.
+    if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+        error_handler_.GetBGError().severity() <
+            Status::Severity::kHardError) {
+      s = error_handler_.GetBGError();
+      return s;
+    }
+
+    // Number of column families that have been dropped.
+    int num_dropped = 0;
+    // Number of column families that have finished flush.
+    int num_finished = 0;
+    for (int i = 0; i < num; ++i) {
+      if (cfds[i]->IsDropped()) {
+        ++num_dropped;
+      } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+                 (flush_memtable_ids[i] != nullptr &&
+                  cfds[i]->imm()->GetEarliestMemTableID() >
+                      *flush_memtable_ids[i])) {
+        ++num_finished;
+      }
+    }
+    if (1 == num_dropped && 1 == num) {
+      s = Status::ColumnFamilyDropped();
+      return s;
+    }
+    // Column families involved in this flush request have either been dropped
+    // or finished flush. Then it's time to finish waiting.
+    if (num_dropped + num_finished == num) {
+      break;
+    }
+    bg_cv_.Wait();
+  }
+  // If not resuming from bg error, and an error has caused the DB to stop,
+  // then report the bg error to the caller.
+  if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+    const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+  Status s;
+  for (auto cf_ptr : column_family_handles) {
+    Status status =
+        this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+    if (!status.ok()) {
+      s = status;
+    }
+  }
+
+  return s;
+}
+
+// NOTE: Calling DisableManualCompaction() may overwrite the
+// user-provided canceled variable in CompactRangeOptions
+void DBImpl::DisableManualCompaction() {
+  InstrumentedMutexLock l(&mutex_);
+  manual_compaction_paused_.fetch_add(1, std::memory_order_release);
+
+  // Mark the canceled as true when the cancellation is triggered by
+  // manual_compaction_paused (may overwrite user-provided `canceled`)
+  for (const auto& manual_compaction : manual_compaction_dequeue_) {
+    manual_compaction->canceled = true;
+  }
+
+  // Wake up manual compactions waiting to start.
+  bg_cv_.SignalAll();
+
+  // Wait for any pending manual compactions to finish (typically through
+  // failing with `Status::Incomplete`) prior to returning. This way we are
+  // guaranteed no pending manual compaction will commit while manual
+  // compactions are "disabled".
+  while (HasPendingManualCompaction()) {
+    bg_cv_.Wait();
+  }
+}
+
+// NOTE: In contrast to DisableManualCompaction(), calling
+// EnableManualCompaction() does NOT overwrite the user-provided *canceled
+// variable to be false since there is NO CHANCE a canceled compaction
+// is uncanceled. In other words, a canceled compaction must have been
+// dropped out of the manual compaction queue, when we disable it.
+void DBImpl::EnableManualCompaction() {
+  InstrumentedMutexLock l(&mutex_);
+  assert(manual_compaction_paused_ > 0);
+  manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+  mutex_.AssertHeld();
+  if (!opened_successfully_) {
+    // Compaction may introduce data races with DB open
+    return;
+  }
+  if (bg_work_paused_ > 0) {
+    // we paused the background work
+    return;
+  } else if (error_handler_.IsBGWorkStopped() &&
+             !error_handler_.IsRecoveryInProgress()) {
+    // There has been a hard error and this call is not part of the recovery
+    // sequence. Bail out here so we don't get into an endless loop of
+    // scheduling BG work which will again call this function
+    return;
+  } else if (shutting_down_.load(std::memory_order_acquire)) {
+    // DB is being deleted; no more background compactions
+    return;
+  }
+  auto bg_job_limits = GetBGJobLimits();
+  bool is_flush_pool_empty =
+      env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
+  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+         bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+    bg_flush_scheduled_++;
+    FlushThreadArg* fta = new FlushThreadArg;
+    fta->db_ = this;
+    fta->thread_pri_ = Env::Priority::HIGH;
+    env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+                   &DBImpl::UnscheduleFlushCallback);
+    --unscheduled_flushes_;
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+        &unscheduled_flushes_);
+  }
+
+  // special case -- if high-pri (flush) thread pool is empty, then schedule
+  // flushes in low-pri (compaction) thread pool.
+  if (is_flush_pool_empty) {
+    while (unscheduled_flushes_ > 0 &&
+           bg_flush_scheduled_ + bg_compaction_scheduled_ <
+               bg_job_limits.max_flushes) {
+      bg_flush_scheduled_++;
+      FlushThreadArg* fta = new FlushThreadArg;
+      fta->db_ = this;
+      fta->thread_pri_ = Env::Priority::LOW;
+      env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+                     &DBImpl::UnscheduleFlushCallback);
+      --unscheduled_flushes_;
+    }
+  }
+
+  if (bg_compaction_paused_ > 0) {
+    // we paused the background compaction
+    return;
+  } else if (error_handler_.IsBGWorkStopped()) {
+    // Compaction is not part of the recovery sequence from a hard error. We
+    // might get here because recovery might do a flush and install a new
+    // super version, which will try to schedule pending compactions. Bail
+    // out here and let the higher level recovery handle compactions
+    return;
+  }
+
+  if (HasExclusiveManualCompaction()) {
+    // only manual compactions are allowed to run. Don't schedule automatic
+    // compactions.
+    TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+    return;
+  }
+
+  while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ <
+             bg_job_limits.max_compactions &&
+         unscheduled_compactions_ > 0) {
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->compaction_pri_ = Env::Priority::LOW;
+    ca->prepicked_compaction = nullptr;
+    bg_compaction_scheduled_++;
+    unscheduled_compactions_--;
+    env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+                   &DBImpl::UnscheduleCompactionCallback);
+  }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+  mutex_.AssertHeld();
+  return GetBGJobLimits(mutable_db_options_.max_background_flushes,
+                        mutable_db_options_.max_background_compactions,
+                        mutable_db_options_.max_background_jobs,
+                        write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+                                           int max_background_compactions,
+                                           int max_background_jobs,
+                                           bool parallelize_compactions) {
+  BGJobLimits res;
+  if (max_background_flushes == -1 && max_background_compactions == -1) {
+    // for our first stab implementing max_background_jobs, simply allocate a
+    // quarter of the threads to flushes.
+    res.max_flushes = std::max(1, max_background_jobs / 4);
+    res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
+  } else {
+    // compatibility code in case users haven't migrated to
+    // max_background_jobs, which automatically computes flush/compaction
+    // limits
+    res.max_flushes = std::max(1, max_background_flushes);
+    res.max_compactions = std::max(1, max_background_compactions);
+  }
+  if (!parallelize_compactions) {
+    // throttle background compactions until we deem necessary
+    res.max_compactions = 1;
+  }
+  return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->queued_for_compaction());
+  cfd->Ref();
+  compaction_queue_.push_back(cfd);
+  cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+  assert(!compaction_queue_.empty());
+  auto cfd = *compaction_queue_.begin();
+  compaction_queue_.pop_front();
+  assert(cfd->queued_for_compaction());
+  cfd->set_queued_for_compaction(false);
+  return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+  assert(!flush_queue_.empty());
+  FlushRequest flush_req = flush_queue_.front();
+  flush_queue_.pop_front();
+  if (!immutable_db_options_.atomic_flush) {
+    assert(flush_req.size() == 1);
+  }
+  for (const auto& elem : flush_req) {
+    if (!immutable_db_options_.atomic_flush) {
+      ColumnFamilyData* cfd = elem.first;
+      assert(cfd);
+      assert(cfd->queued_for_flush());
+      cfd->set_queued_for_flush(false);
+    }
+  }
+  // TODO: need to unset flush reason?
+  return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+    std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+  assert(!compaction_queue_.empty());
+  assert(*token == nullptr);
+  autovector<ColumnFamilyData*> throttled_candidates;
+  ColumnFamilyData* cfd = nullptr;
+  while (!compaction_queue_.empty()) {
+    auto first_cfd = *compaction_queue_.begin();
+    compaction_queue_.pop_front();
+    assert(first_cfd->queued_for_compaction());
+    if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+      throttled_candidates.push_back(first_cfd);
+      continue;
+    }
+    cfd = first_cfd;
+    cfd->set_queued_for_compaction(false);
+    break;
+  }
+  // Add throttled compaction candidates back to queue in the original order.
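+  // Pushing the reversed sequence to the queue's front restores the
+  // throttled candidates' original relative order at the head of the queue.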
+  for (auto iter = throttled_candidates.rbegin();
+       iter != throttled_candidates.rend(); ++iter) {
+    compaction_queue_.push_front(*iter);
+  }
+  return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+                                  FlushReason flush_reason) {
+  mutex_.AssertHeld();
+  if (flush_req.empty()) {
+    return;
+  }
+  if (!immutable_db_options_.atomic_flush) {
+    // For the non-atomic flush case, we never schedule multiple column
+    // families in the same flush request.
+    assert(flush_req.size() == 1);
+    ColumnFamilyData* cfd = flush_req[0].first;
+    assert(cfd);
+
+    if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
+      cfd->Ref();
+      cfd->set_queued_for_flush(true);
+      cfd->SetFlushReason(flush_reason);
+      ++unscheduled_flushes_;
+      flush_queue_.push_back(flush_req);
+    }
+  } else {
+    for (auto& iter : flush_req) {
+      ColumnFamilyData* cfd = iter.first;
+      cfd->Ref();
+      cfd->SetFlushReason(flush_reason);
+    }
+    ++unscheduled_flushes_;
+    flush_queue_.push_back(flush_req);
+  }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+  mutex_.AssertHeld();
+  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+    AddToCompactionQueue(cfd);
+    ++unscheduled_compactions_;
+  }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                                  FileType type, uint64_t number, int job_id) {
+  mutex_.AssertHeld();
+  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+  purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
+  FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+
+  IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+  static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(
+      fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+  TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+  auto prepicked_compaction =
+      static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+  static_cast_with_check<DBImpl>(ca.db)->BackgroundCallCompaction(
+      prepicked_compaction, Env::Priority::LOW);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+  CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+  delete static_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+  TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+  auto* prepicked_compaction = ca.prepicked_compaction;
+  assert(prepicked_compaction && prepicked_compaction->compaction);
+  ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+  CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
+  Env::Priority compaction_pri = ca_ptr->compaction_pri_;
+  if (Env::Priority::BOTTOM == compaction_pri) {
+    // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
+    ca_ptr->db->bg_bottom_compaction_scheduled_--;
+  } else if (Env::Priority::LOW == compaction_pri) {
+    // Decrement bg_compaction_scheduled_ if priority is LOW
+    ca_ptr->db->bg_compaction_scheduled_--;
+  }
+  CompactionArg ca = *(ca_ptr);
+  delete reinterpret_cast<CompactionArg*>(arg);
+  if (ca.prepicked_compaction != nullptr) {
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+  CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
+  Env::Priority compaction_pri = ca_ptr->compaction_pri_;
+  if (Env::Priority::BOTTOM == compaction_pri) {
+    // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
+    ca_ptr->db->bg_bottom_compaction_scheduled_--;
+  } else if (Env::Priority::LOW == compaction_pri) {
+    // Decrement bg_compaction_scheduled_ if priority is LOW
+    ca_ptr->db->bg_compaction_scheduled_--;
+  }
+  CompactionArg ca = *(ca_ptr);
+  delete reinterpret_cast<CompactionArg*>(arg);
+  if (ca.prepicked_compaction != nullptr) {
+    // if it's a manual compaction, set status to ManualCompactionPaused
+    if (ca.prepicked_compaction->manual_compaction_state) {
+      ca.prepicked_compaction->manual_compaction_state->done = true;
+      ca.prepicked_compaction->manual_compaction_state->status =
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+    if (ca.prepicked_compaction->compaction != nullptr) {
+      ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
+      delete ca.prepicked_compaction->compaction;
+    }
+    delete ca.prepicked_compaction;
+  }
+  TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+  // Decrement bg_flush_scheduled_ in flush callback
+  reinterpret_cast<FlushThreadArg*>(arg)->db_->bg_flush_scheduled_--;
+  Env::Priority flush_pri = reinterpret_cast<FlushThreadArg*>(arg)->thread_pri_;
+  if (Env::Priority::LOW == flush_pri) {
+    TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback");
+  } else if (Env::Priority::HIGH == flush_pri) {
+    TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback");
+  }
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+  TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+                               LogBuffer* log_buffer, FlushReason* reason,
+                               Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  Status status;
+  *reason = FlushReason::kOthers;
+  // If BG work is stopped due to an error, but a recovery is in progress,
+  // that means this flush is part of the recovery. So allow it to go through
+  if (!error_handler_.IsBGWorkStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      status = Status::ShutdownInProgress();
+    }
+  } else if (!error_handler_.IsRecoveryInProgress()) {
+    status = error_handler_.GetBGError();
+  }
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  autovector<BGFlushArg> bg_flush_args;
+  std::vector<SuperVersionContext>& superversion_contexts =
+      job_context->superversion_contexts;
+  autovector<ColumnFamilyData*> column_families_not_to_flush;
+  while (!flush_queue_.empty()) {
+    // This cfd is already referenced
+    const FlushRequest& flush_req = PopFirstFromFlushQueue();
+    superversion_contexts.clear();
+    superversion_contexts.reserve(flush_req.size());
+
+    for (const auto& iter : flush_req) {
+      ColumnFamilyData* cfd = iter.first;
+      if (cfd->GetMempurgeUsed()) {
+        // If imm() contains silent memtables (e.g.: because
+        // MemPurge was activated), requesting a flush will
+        // mark the imm_needed as true.
+        cfd->imm()->FlushRequested();
+      }
+
+      if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+        // can't flush this CF, try next one
+        column_families_not_to_flush.push_back(cfd);
+        continue;
+      }
+      superversion_contexts.emplace_back(SuperVersionContext(true));
+      bg_flush_args.emplace_back(cfd, iter.second,
+                                 &(superversion_contexts.back()));
+    }
+    if (!bg_flush_args.empty()) {
+      break;
+    }
+  }
+
+  if (!bg_flush_args.empty()) {
+    auto bg_job_limits = GetBGJobLimits();
+    for (const auto& arg : bg_flush_args) {
+      ColumnFamilyData* cfd = arg.cfd_;
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "Calling FlushMemTableToOutputFile with column "
+          "family [%s], flush slots available %d, compaction slots available "
+          "%d, "
+          "flush slots scheduled %d, compaction slots scheduled %d",
+          cfd->GetName().c_str(), bg_job_limits.max_flushes,
+          bg_job_limits.max_compactions, bg_flush_scheduled_,
+          bg_compaction_scheduled_);
+    }
+    status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+                                         job_context, log_buffer, thread_pri);
+    TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+    // All the CFDs in the FlushReq must have the same flush reason, so just
+    // grab the first one
+    *reason = bg_flush_args[0].cfd_->GetFlushReason();
+    for (auto& arg : bg_flush_args) {
+      ColumnFamilyData* cfd = arg.cfd_;
+      if (cfd->UnrefAndTryDelete()) {
+        arg.cfd_ = nullptr;
+      }
+    }
+  }
+  for (auto cfd : column_families_not_to_flush) {
+    cfd->UnrefAndTryDelete();
+  }
+  return status;
+}
+
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+
+  TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr);
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    assert(bg_flush_scheduled_);
+    num_running_flushes_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+    FlushReason reason;
+
+    Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+                               &reason, thread_pri);
+    if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+        reason != FlushReason::kErrorRecovery) {
+      // Wait a little bit before retrying background flush in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed flushes for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background flush error: %s"
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      log_buffer.FlushBufferToLog();
+      LogFlush(immutable_db_options_.info_log);
+      immutable_db_options_.clock->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
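+
+    // CaptureCurrentFileNumberInPendingOutputs() above and this Release call
+    // bracket the region in which newly created files must not be treated as
+    // obsolete by FindObsoleteFiles().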
+
+    // If flush failed, we want to delete all temporary files that we might
+    // have created. Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsColumnFamilyDropped());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+    assert(num_running_flushes_ > 0);
+    num_running_flushes_--;
+    bg_flush_scheduled_--;
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    atomic_flush_install_cv_.SignalAll();
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call
+    // may signal the DB destructor that it's OK to proceed with destruction.
+    // In that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                      Env::Priority bg_thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    num_running_compactions_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+
+    assert((bg_thread_pri == Env::Priority::BOTTOM &&
+            bg_bottom_compaction_scheduled_) ||
+           (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+    Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+                                    prepicked_compaction, bg_thread_pri);
+    TEST_SYNC_POINT("BackgroundCallCompaction:1");
+    if (s.IsBusy()) {
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      immutable_db_options_.clock->SleepForMicroseconds(
+          10000);  // prevent hot loop
+      mutex_.Lock();
+    } else if (!s.ok() && !s.IsShutdownInProgress() &&
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      log_buffer.FlushBufferToLog();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background compaction error: %s, "
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      LogFlush(immutable_db_options_.info_log);
+      immutable_db_options_.clock->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    } else if (s.IsManualCompactionPaused()) {
+      assert(prepicked_compaction);
+      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+      assert(m);
+      ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+                       m->cfd->GetName().c_str(), job_context.job_id);
+    }
+
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If compaction failed, we want to delete all temporary files that we
+    // might have created (they might not be all recorded in job_context in
+    // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsManualCompactionPaused() &&
+                                        !s.IsColumnFamilyDropped() &&
+                                        !s.IsBusy());
+    TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+        TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+
+    assert(num_running_compactions_ > 0);
+    num_running_compactions_--;
+
+    if (bg_thread_pri == Env::Priority::LOW) {
+      bg_compaction_scheduled_--;
+    } else {
+      assert(bg_thread_pri == Env::Priority::BOTTOM);
+      bg_bottom_compaction_scheduled_--;
+    }
+
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+
+    if (prepicked_compaction != nullptr &&
+        prepicked_compaction->task_token != nullptr) {
+      // Releasing task tokens affects (and asserts on) the DB state, so
+      // must be done before we potentially signal the DB close process to
+      // proceed below.
+      prepicked_compaction->task_token.reset();
+    }
+
+    if (made_progress ||
+        (bg_compaction_scheduled_ == 0 &&
+         bg_bottom_compaction_scheduled_ == 0) ||
+        HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+      // signal if
+      // * made_progress -- need to wakeup DelayWrite
+      // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+      // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+      // If none of this is true, there is no need to signal since nobody is
+      // waiting for it
+      bg_cv_.SignalAll();
+    }
+    // IMPORTANT: there should be no code after calling SignalAll. This call
+    // may signal the DB destructor that it's OK to proceed with destruction.
+    // In that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
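+
+// A note on the counters used above: bg_compaction_scheduled_ and
+// bg_bottom_compaction_scheduled_ count jobs already handed to the LOW and
+// BOTTOM thread pools, while unscheduled_compactions_ counts work sitting in
+// compaction_queue_ that no pool thread has picked up yet.
+// MaybeScheduleFlushOrCompaction() is what moves work from the unscheduled
+// bucket into the pools, which is why it is re-run after every decrement.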
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+                                    JobContext* job_context,
+                                    LogBuffer* log_buffer,
+                                    PrepickedCompaction* prepicked_compaction,
+                                    Env::Priority thread_pri) {
+  ManualCompactionState* manual_compaction =
+      prepicked_compaction == nullptr
+          ? nullptr
+          : prepicked_compaction->manual_compaction_state;
+  *made_progress = false;
+  mutex_.AssertHeld();
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+  bool is_manual = (manual_compaction != nullptr);
+  std::unique_ptr<Compaction> c;
+  if (prepicked_compaction != nullptr &&
+      prepicked_compaction->compaction != nullptr) {
+    c.reset(prepicked_compaction->compaction);
+  }
+  bool is_prepicked = is_manual || c;
+
+  // (manual_compaction->in_progress == false);
+  bool trivial_move_disallowed =
+      is_manual && manual_compaction->disallow_trivial_move;
+
+  CompactionJobStats compaction_job_stats;
+  Status status;
+  if (!error_handler_.IsBGWorkStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      status = Status::ShutdownInProgress();
+    } else if (is_manual &&
+               manual_compaction->canceled.load(std::memory_order_acquire)) {
+      status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+  } else {
+    status = error_handler_.GetBGError();
+    // If we get here, it means a hard error happened after this compaction
+    // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+    // a chance to execute. Since we didn't pop a cfd from the compaction
+    // queue, increment unscheduled_compactions_
+    unscheduled_compactions_++;
+  }
+
+  if (!status.ok()) {
+    if (is_manual) {
+      manual_compaction->status = status;
+      manual_compaction->done = true;
+      manual_compaction->in_progress = false;
+      manual_compaction = nullptr;
+    }
+    if (c) {
+      c->ReleaseCompactionFiles(status);
+      c.reset();
+    }
+    return status;
+  }
+
+  if (is_manual) {
+    // another thread cannot pick up the same work
+    manual_compaction->in_progress = true;
+  }
+
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
+  std::unique_ptr<TaskLimiterToken> task_token;
+
+  // InternalKey manual_end_storage;
+  // InternalKey* manual_end = &manual_end_storage;
+  bool sfm_reserved_compact_space = false;
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    assert(m->in_progress);
+    if (!c) {
+      m->done = true;
+      m->manual_end = nullptr;
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Manual compaction from level-%d from %s .. "
+          "%s; nothing to do\n",
+          m->cfd->GetName().c_str(), m->input_level,
+          (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+          (m->end ? m->end->DebugString(true).c_str() : "(end)"));
+    } else {
+      // First check if we have enough room to do the compaction
+      bool enough_room = EnoughRoomForCompaction(
+          m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+      if (!enough_room) {
+        // Then don't do the compaction
+        c->ReleaseCompactionFiles(status);
+        c.reset();
+        // m's vars will get set properly at the end of this function,
+        // as long as status == CompactionTooLarge
+        status = Status::CompactionTooLarge();
+      } else {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Manual compaction from level-%d to level-%d from %s .. "
+            "%s; will stop at %s\n",
+            m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+            (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+            (m->end ? m->end->DebugString(true).c_str() : "(end)"),
+            ((m->done || m->manual_end == nullptr)
+                 ? "(end)"
"(end)" + : m->manual_end->DebugString(true).c_str())); + } + } + } else if (!is_prepicked && !compaction_queue_.empty()) { + if (HasExclusiveManualCompaction()) { + // Can't compact right now, but try again later + TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); + + // Stay in the compaction queue. + unscheduled_compactions_++; + + return Status::OK(); + } + + auto cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } + + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->UnrefAndTryDelete()) { + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. Need to make sure mutex is held + // until we make a copy in the following code + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); + c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, + log_buffer)); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + + if (c != nullptr) { + bool enough_room = EnoughRoomForCompaction( + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_options()), + *(c->mutable_cf_options())); + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + } else { + // update statistics + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files); + + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! 
+            AddToCompactionQueue(cfd);
+            ++unscheduled_compactions_;
+            MaybeScheduleFlushOrCompaction();
+          }
+        }
+      }
+    }
+  }
+
+  IOStatus io_s;
+  if (!c) {
+    // Nothing to do
+    ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+  } else if (c->deletion_compaction()) {
+    // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+    // file if there is alive snapshot pointing to it
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    assert(c->num_input_files(1) == 0);
+    assert(c->column_family_data()->ioptions()->compaction_style ==
+           kCompactionStyleFIFO);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    for (const auto& f : *c->inputs(0)) {
+      c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    io_s = versions_->io_status();
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+                     c->column_family_data()->GetName().c_str(),
+                     c->num_input_files(0));
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
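+
+    // A trivial move re-links the input files into the output level through a
+    // VersionEdit (the DeleteFile/AddFile pairs below); no data is rewritten,
+    // so only the MANIFEST changes.
+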
+    // Instrument for event update
+    // TODO(yhchiang): add op details for showing trivial-move.
+    ThreadStatusUtil::SetColumnFamily(
+        c->column_family_data(), c->column_family_data()->ioptions()->env,
+        immutable_db_options_.enable_thread_tracking);
+    ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    // Move files to next level
+    int32_t moved_files = 0;
+    int64_t moved_bytes = 0;
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+        c->edit()->AddFile(
+            c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+            f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
+            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
+            f->oldest_blob_file_number, f->oldest_ancester_time,
+            f->file_creation_time, f->file_checksum,
+            f->file_checksum_func_name, f->unique_id);
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+            c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+            c->output_level(), f->fd.GetFileSize());
+        ++moved_files;
+        moved_bytes += f->fd.GetFileSize();
+      }
+    }
+    if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
+        c->immutable_options()->compaction_pri == kRoundRobin) {
+      int start_level = c->start_level();
+      if (start_level > 0) {
+        auto vstorage = c->input_version()->storage_info();
+        c->edit()->AddCompactCursor(
+            start_level,
+            vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
+      }
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    io_s = versions_->io_status();
+    // Use latest MutableCFOptions
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+                                                             moved_bytes);
+    {
+      event_logger_.LogToBuffer(log_buffer)
+          << "job" << job_context->job_id << "event"
+          << "trivial_move"
+          << "destination_level" << c->output_level() << "files" << moved_files
+          << "total_files_size" << moved_bytes;
+    }
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        c->column_family_data()->GetName().c_str(), moved_files,
+        c->output_level(), moved_bytes, status.ToString().c_str(),
+        c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+    *made_progress = true;
+
+    // Clear Instrument
+    ThreadStatusUtil::ResetThreadStatus();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  } else if (!is_prepicked && c->output_level() > 0 &&
+             c->output_level() ==
+                 c->column_family_data()
+                     ->current()
+                     ->storage_info()
+                     ->MaxOutputLevel(
+                         immutable_db_options_.allow_ingest_behind) &&
+             env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    // Forward compactions involving last level to the bottom pool if it
+    // exists, such that compactions unlikely to contribute to write stalls
+    // can be delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->compaction_pri_ = Env::Priority::BOTTOM; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->compaction = c.release(); + ca->prepicked_compaction->manual_compaction_state = nullptr; + // Transfer requested token, so it doesn't need to do it again. + ca->prepicked_compaction->task_token = std::move(task_token); + ++bg_bottom_compaction_scheduled_; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, + this, &DBImpl::UnscheduleCompactionCallback); + } else { + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + int output_level __attribute__((__unused__)); + output_level = c->output_level(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", + &output_level); + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + assert(is_snapshot_supported_ || snapshots_.empty()); + + CompactionJob compaction_job( + job_context->job_id, c.get(), immutable_db_options_, + mutable_db_options_, file_options_for_compaction_, versions_.get(), + &shutting_down_, log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, job_context, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, + &compaction_job_stats, thread_pri, io_tracer_, + is_manual ? manual_compaction->canceled + : kManualCompactionCanceledFalse_, + db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), + c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_, + &bg_bottom_compaction_scheduled_); + compaction_job.Prepare(); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + // Should handle erorr? 
+    // Should handle error?
+    compaction_job.Run().PermitUncheckedError();
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+    mutex_.Lock();
+
+    status = compaction_job.Install(*c->mutable_cf_options());
+    io_s = compaction_job.io_status();
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                         &job_context->superversion_contexts[0],
+                                         *c->mutable_cf_options());
+    }
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  }
+
+  if (status.ok() && !io_s.ok()) {
+    status = io_s;
+  } else {
+    io_s.PermitUncheckedError();
+  }
+
+  if (c != nullptr) {
+    c->ReleaseCompactionFiles(status);
+    *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+    // Need to make sure SstFileManager does its bookkeeping
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm && sfm_reserved_compact_space) {
+      sfm->OnCompactionCompletion(c.get());
+    }
+#endif  // ROCKSDB_LITE
+
+    NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+                                compaction_job_stats, job_context->job_id);
+  }
+
+  if (status.ok() || status.IsCompactionTooLarge() ||
+      status.IsManualCompactionPaused()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutdown
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+                   status.ToString().c_str());
+    if (!io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // the CURRENT file. With the current code, it's just difficult to tell.
+      // So just be pessimistic and try to write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      auto err_reason = versions_->io_status().ok()
+                            ? BackgroundErrorReason::kCompaction
+                            : BackgroundErrorReason::kManifestWrite;
+      error_handler_.SetBGError(io_s, err_reason);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+    if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+      // Put this cfd back in the compaction queue so we can retry after some
+      // time
+      auto cfd = c->column_family_data();
+      assert(cfd != nullptr);
+      // Since this compaction failed, we need to recompute the score so it
+      // takes the original input files into account
+      c->column_family_data()
+          ->current()
+          ->storage_info()
+          ->ComputeCompactionScore(*(c->immutable_options()),
+                                   *(c->mutable_cf_options()));
+      if (!cfd->queued_for_compaction()) {
+        AddToCompactionQueue(cfd);
+        ++unscheduled_compactions_;
+      }
+    }
+  }
+  // this will unref its input_version and column_family_data
+  c.reset();
+
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    if (!status.ok()) {
+      m->status = status;
+      m->done = true;
+    }
+    // For universal compaction:
+    // Because universal compaction always happens at level 0, one compaction
+    // will pick up all overlapping files. No files will be filtered out due
+    // to size limit and left for a successive compaction. So we can safely
+    // conclude the current compaction.
+    //
+    // Also note that, if we don't stop here, then the current compaction
+    // writes a new file back to level 0, which will be used in successive
+    // compaction. Hence the manual compaction will never finish.
+    //
+    // Stop the compaction if manual_end points to nullptr -- this means
+    // that we compacted the whole range. manual_end should always point
+    // to nullptr in case of universal compaction
+    if (m->manual_end == nullptr) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range. Update *m
+      // to the range that is left to be compacted.
+      // Universal and FIFO compactions should always compact the whole range
+      assert(m->cfd->ioptions()->compaction_style !=
+                 kCompactionStyleUniversal ||
+             m->cfd->ioptions()->num_levels > 1);
+      assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+      m->tmp_storage = *m->manual_end;
+      m->begin = &m->tmp_storage;
+      m->incomplete = true;
+    }
+    m->in_progress = false;  // not being processed anymore
+  }
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+  return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+  return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+  assert(manual_compaction_paused_ == 0);
+  manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+  // Remove from queue
+  std::deque<DBImpl::ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      it = manual_compaction_dequeue_.erase(it);
+      return;
+    }
+    ++it;
+  }
+  assert(false);
+  return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+  if (num_running_ingest_file_ > 0) {
+    // We need to wait for other IngestExternalFile() calls to finish
+    // before running a manual compaction.
+    return true;
+  }
+  if (m->exclusive) {
+    return (bg_bottom_compaction_scheduled_ > 0 ||
+            bg_compaction_scheduled_ > 0);
+  }
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  bool seen = false;
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      ++it;
+      seen = true;
+      continue;
+    } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // Consider the other manual compaction *it, conflicts if:
+      // overlaps with m
+      // and (*it) is ahead in the queue and is not yet in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Scan the queue for a conflicting manual compaction
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // Allow automatic compaction if manual compaction is
+      // in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Scan the queue for an exclusive manual compaction
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+  if ((m->exclusive) || (m1->exclusive)) {
+    return true;
+  }
+  if (m->cfd != m1->cfd) {
+    return false;
+  }
+  return false;
+}
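+
+// Note on MCOverlap() above: as written, two manual compactions "overlap"
+// only when at least one of them is exclusive; non-exclusive requests, even
+// on the same column family, fall through to the final return false.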
+
+#ifndef ROCKSDB_LITE
+void DBImpl::BuildCompactionJobInfo(
+    const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id,
+    const Version* current, CompactionJobInfo* compaction_job_info) const {
+  assert(compaction_job_info != nullptr);
+  compaction_job_info->cf_id = cfd->GetID();
+  compaction_job_info->cf_name = cfd->GetName();
+  compaction_job_info->status = st;
+  compaction_job_info->thread_id = env_->GetThreadID();
+  compaction_job_info->job_id = job_id;
+  compaction_job_info->base_input_level = c->start_level();
+  compaction_job_info->output_level = c->output_level();
+  compaction_job_info->stats = compaction_job_stats;
+  compaction_job_info->table_properties = c->GetOutputTableProperties();
+  compaction_job_info->compaction_reason = c->compaction_reason();
+  compaction_job_info->compression = c->output_compression();
+  for (size_t i = 0; i < c->num_input_levels(); ++i) {
+    for (const auto fmd : *c->inputs(i)) {
+      const FileDescriptor& desc = fmd->fd;
+      const uint64_t file_number = desc.GetNumber();
+      auto fn = TableFileName(c->immutable_options()->cf_paths, file_number,
+                              desc.GetPathId());
+      compaction_job_info->input_files.push_back(fn);
+      compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+          static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+      if (compaction_job_info->table_properties.count(fn) == 0) {
+        std::shared_ptr<const TableProperties> tp;
+        auto s = current->GetTableProperties(&tp, fmd, &fn);
+        if (s.ok()) {
+          compaction_job_info->table_properties[fn] = tp;
+        }
+      }
+    }
+  }
+  for (const auto& newf : c->edit()->GetNewFiles()) {
+    const FileMetaData& meta = newf.second;
+    const FileDescriptor& desc = meta.fd;
+    const uint64_t file_number = desc.GetNumber();
+    compaction_job_info->output_files.push_back(TableFileName(
+        c->immutable_options()->cf_paths, file_number, desc.GetPathId()));
+    compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+        newf.first, file_number, meta.oldest_blob_file_number});
+  }
+  compaction_job_info->blob_compression_type =
+      c->mutable_cf_options()->blob_compression_type;
+
+  // Update BlobFilesInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+    BlobFileAdditionInfo blob_file_addition_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+        blob_file.GetTotalBlobBytes());
+    compaction_job_info->blob_file_addition_infos.emplace_back(
+        std::move(blob_file_addition_info));
+  }
+
+  // Update BlobFilesGarbageInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) {
+    BlobFileGarbageInfo blob_file_garbage_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(),
+        blob_file.GetGarbageBlobBytes());
+    compaction_job_info->blob_file_garbage_infos.emplace_back(
+        std::move(blob_file_garbage_info));
+  }
+}
+#endif
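+
+// The CompactionJobInfo assembled above is what EventListener callbacks
+// receive. A minimal listener (illustrative; MyListener is not part of this
+// codebase) could consume it as follows:
+//
+//   class MyListener : public EventListener {
+//     void OnCompactionCompleted(DB* /*db*/,
+//                                const CompactionJobInfo& info) override {
+//       fprintf(stderr, "compacted %s: L%d -> L%d, %zu input files\n",
+//               info.cf_name.c_str(), info.base_input_level,
+//               info.output_level, info.input_files.size());
+//     }
+//   };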
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+
+  // Update max_total_in_memory_state_
+  size_t old_memtable_size = 0;
+  auto* old_sv = cfd->GetSuperVersion();
+  if (old_sv) {
+    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+                        old_sv->mutable_cf_options.max_write_buffer_number;
+  }
+
+  // This branch is rarely taken
+  if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+    sv_context->NewSuperVersion();
+  }
+  cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot blocking bottommost
+  // compaction may already have been released here. But assuming newer
+  // snapshots are created and released frequently, the compaction will be
+  // triggered soon anyway.
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+    if (!my_cfd->ioptions()->allow_ingest_behind) {
+      bottommost_files_mark_threshold_ = std::min(
+          bottommost_files_mark_threshold_,
+          my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+    }
+  }
+
+  // Whenever we install a new SuperVersion, we might need to issue new flushes
+  // or compactions.
+  SchedulePendingCompaction(cfd);
+  MaybeScheduleFlushOrCompaction();
+
+  // Update max_total_in_memory_state_
+  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+                               mutable_cf_options.write_buffer_size *
+                                   mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain list of files in
+// directories, e.g. env_->getChildren while holding db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+  return files_grabbed_for_purge_.find(file_number) ==
+             files_grabbed_for_purge_.end() &&
+         purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+  files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+  InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // in use by a compaction job.
+  assert(!snapshot_checker_);
+  snapshot_checker_.reset(snapshot_checker);
+}
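+
+// GetSnapshotContext() below collects everything a flush/compaction job needs
+// to reason about snapshots: the live snapshot sequence numbers, the earliest
+// write-conflict snapshot, and the optional SnapshotChecker; when a checker
+// is in use it also pins a fresh "job snapshot" so that keys the checker may
+// still consider visible are not dropped prematurely.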
+
+void DBImpl::GetSnapshotContext(
+    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+    SequenceNumber* earliest_write_conflict_snapshot,
+    SnapshotChecker** snapshot_checker_ptr) {
+  mutex_.AssertHeld();
+  assert(job_context != nullptr);
+  assert(snapshot_seqs != nullptr);
+  assert(earliest_write_conflict_snapshot != nullptr);
+  assert(snapshot_checker_ptr != nullptr);
+
+  *snapshot_checker_ptr = snapshot_checker_.get();
+  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  }
+  if (*snapshot_checker_ptr != nullptr) {
+    // If snapshot_checker is used, that means the flush/compaction may
+    // contain values not visible to snapshots taken after the
+    // flush/compaction job starts. Take a snapshot and it will appear
+    // in snapshot_seqs and force the compaction iterator to consider
+    // such snapshots.
+    const Snapshot* job_snapshot =
+        GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+    job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+  }
+  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+
+Status DBImpl::WaitForCompact(bool wait_unscheduled) {
+  // Wait until the compaction completes
+  InstrumentedMutexLock l(&mutex_);
+  while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+          bg_flush_scheduled_ ||
+          (wait_unscheduled && unscheduled_compactions_)) &&
+         (error_handler_.GetBGError().ok())) {
+    bg_cv_.Wait();
+  }
+  return error_handler_.GetBGError();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..7054b0669
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  InstrumentedMutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+Status DBImpl::TEST_SwitchWAL() {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  void* writer = TEST_BeginWrite();
+  auto s = SwitchWAL(&write_context);
+  TEST_EndWrite(writer);
+  return s;
+}
+
+uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  InstrumentedMutexLock l(&mutex_);
+  return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::vector<FileMetaData>>* metadata,
+    std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
+  assert(metadata);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  assert(cfh);
+
+  auto cfd = cfh->cfd();
+  assert(cfd);
+
+  InstrumentedMutexLock l(&mutex_);
+
+  const auto* current = cfd->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  metadata->resize(NumberLevels());
+
+  for (int level = 0; level < NumberLevels(); ++level) {
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
+
+    (*metadata)[level].clear();
+    (*metadata)[level].reserve(files.size());
+
+    for (const auto& f : files) {
+      (*metadata)[level].push_back(*f);
+    }
+  }
+
+  if (blob_metadata) {
+    *blob_metadata = vstorage->GetBlobFiles();
+  }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->manifest_file_number();
+}
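+
+// The TEST_* hooks in this file compile only when NDEBUG is not defined and
+// are driven from white-box tests, usually together with the sync points
+// referenced above; a typical (illustrative) test-side setup:
+//
+//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+//       {{"DBImpl::BGWorkFlush", "SomeTest:AfterFlushScheduled"}});
+//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();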
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+  return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+                                 const Slice* end,
+                                 ColumnFamilyHandle* column_family,
+                                 bool disallow_trivial_move) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  int output_level =
+      (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+       cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+          ? level
+          : level + 1;
+  return RunManualCompaction(
+      cfd, level, output_level, CompactRangeOptions(), begin, end, true,
+      disallow_trivial_move,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  if (cfd == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  }
+
+  Status s;
+  void* writer = TEST_BeginWrite();
+  if (two_write_queues_) {
+    WriteThread::Writer nonmem_w;
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    s = SwitchMemtable(cfd, &write_context);
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  } else {
+    s = SwitchMemtable(cfd, &write_context);
+  }
+  TEST_EndWrite(writer);
+  return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+                                  ColumnFamilyHandle* cfh) {
+  FlushOptions fo;
+  fo.wait = wait;
+  fo.allow_write_stall = allow_write_stall;
+  ColumnFamilyData* cfd;
+  if (cfh == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+    cfd = cfhi->cfd();
+  }
+  return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+                                  const FlushOptions& flush_opts) {
+  return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+  return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForBackgroundWork() {
+  InstrumentedMutexLock l(&mutex_);
+  WaitForBackgroundWork();
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+  // Wait until the compaction completes
+  return WaitForCompact(wait_unscheduled);
+}
+
+Status DBImpl::TEST_WaitForPurge() {
+  InstrumentedMutexLock l(&mutex_);
+  while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
+    bg_cv_.Wait();
+  }
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_GetBGError() {
+  InstrumentedMutexLock l(&mutex_);
+  return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+  auto w = new WriteThread::Writer();
+  write_thread_.EnterUnbatched(w, &mutex_);
+  return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+  auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+  write_thread_.ExitUnbatched(writer);
+  delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+  InstrumentedMutexLock l(&log_write_mutex_);
+  return logs_to_free_.size();
+}
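+
+// TEST_BeginWrite()/TEST_EndWrite() above park a dummy writer at the head of
+// the write queue so a test can exercise code that must not race with
+// foreground writes; an illustrative pairing:
+//
+//   void* w = db_impl->TEST_BeginWrite();  // blocks other writers
+//   /* ... poke at internal state ... */
+//   db_impl->TEST_EndWrite(w);             // releases the queue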
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+  InstrumentedMutexLock l(&mutex_);
+  return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+    std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+  std::vector<std::string> cf_names;
+  std::vector<const ImmutableCFOptions*> iopts;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      cf_names.push_back(cfd->GetName());
+      iopts.push_back(cfd->ioptions());
+    }
+  }
+  iopts_map->clear();
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    iopts_map->insert({cf_names[i], iopts[i]});
+  }
+
+  return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+  return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+  return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+  return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+  autovector<MemTable*> empty_list;
+  return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+    ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+  InstrumentedMutexLock l(&mutex_);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+  return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+  if (last_seq_same_as_publish_seq_) {
+    return versions_->LastSequence();
+  } else {
+    return versions_->LastAllocatedSequence();
+  }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+    uint64_t write_buffer_size) const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::TEST_WaitForPeridicTaskRun(std::function<void()> callback) const {
+  periodic_task_scheduler_.TEST_WaitForRun(callback);
+}
+
+const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
+  return periodic_task_scheduler_;
+}
+
+SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
+  InstrumentedMutexLock l(&mutex_);
+  return seqno_time_mapping_;
+}
+
+#endif  // !ROCKSDB_LITE
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+  return EstimateInMemoryStatsHistorySize();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..c1b1e4137
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                   const Slice* begin, const Slice* end) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  InternalKey start_key, end_key;
+  if (begin != nullptr) {
+    start_key.SetMinPossibleForUserKey(*begin);
+  }
+  if (end != nullptr) {
+    end_key.SetMaxPossibleForUserKey(*end);
+  }
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto vstorage = cfd->current()->storage_info();
+    for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+      std::vector<FileMetaData*> inputs;
+      vstorage->GetOverlappingInputs(
+          level, begin == nullptr ? nullptr : &start_key,
+          end == nullptr ? nullptr : &end_key, &inputs);
+      for (auto f : inputs) {
+        f->marked_for_compaction = true;
+      }
+    }
+    // Since we have some more files to compact, we should also recompute
+    // compaction score
+    vstorage->ComputeCompactionScore(*cfd->ioptions(),
+                                     *cfd->GetLatestMutableCFOptions());
+    SchedulePendingCompaction(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
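+
+// SuggestCompactRange() above only marks the overlapping files and recomputes
+// the compaction score; the actual work happens whenever the background
+// scheduler picks it up. An illustrative call, assuming an open DBImpl* db:
+//
+//   Slice begin("a"), end("m");
+//   Status s = db->SuggestCompactRange(db->DefaultColumnFamily(),
+//                                      &begin, &end);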
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+  assert(column_family);
+
+  if (target_level < 1) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+    return Status::InvalidArgument("Invalid target level");
+  }
+
+  Status status;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+    const auto* vstorage = cfd->current()->storage_info();
+
+    if (target_level >= vstorage->num_levels()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "PromoteL0 FAILED. Target level %d does not exist\n",
+                     target_level);
+      job_context.Clean();
+      status = Status::InvalidArgument("Target level does not exist");
+      return status;
+    }
+
+    // Sort L0 files by range.
+    const InternalKeyComparator* icmp = &cfd->internal_comparator();
+    auto l0_files = vstorage->LevelFiles(0);
+    std::sort(l0_files.begin(), l0_files.end(),
+              [icmp](FileMetaData* f1, FileMetaData* f2) {
+                return icmp->Compare(f1->largest, f2->largest) < 0;
+              });
+
+    // Check that no L0 file is being compacted and that they have
+    // non-overlapping ranges.
+    for (size_t i = 0; i < l0_files.size(); ++i) {
+      auto f = l0_files[i];
+      if (f->being_compacted) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+                       f->fd.GetNumber());
+        job_context.Clean();
+        status =
+            Status::InvalidArgument("PromoteL0 called during L0 compaction");
+        return status;
+      }
+
+      if (i == 0) continue;
+      auto prev_f = l0_files[i - 1];
+      if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+                       " have overlapping ranges\n",
+                       prev_f->fd.GetNumber(), f->fd.GetNumber());
+        job_context.Clean();
+        status = Status::InvalidArgument("L0 has overlapping files");
+        return status;
+      }
+    }
+
+    // Check that all levels up to target_level are empty.
+    for (int level = 1; level <= target_level; ++level) {
+      if (vstorage->NumLevelFiles(level) > 0) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. Level %d not empty\n", level);
+        job_context.Clean();
+        status = Status::InvalidArgument(
+            "All levels up to target_level "
+            "must be empty");
+        return status;
+      }
+    }
+
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : l0_files) {
+      edit.DeleteFile(0, f->fd.GetNumber());
+      edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                   f->fd.GetFileSize(), f->smallest, f->largest,
+                   f->fd.smallest_seqno, f->fd.largest_seqno,
+                   f->marked_for_compaction, f->temperature,
+                   f->oldest_blob_file_number, f->oldest_ancester_time,
+                   f->file_creation_time, f->file_checksum,
+                   f->file_checksum_func_name, f->unique_id);
+    }
+
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+  }  // lock released here
+  LogFlush(immutable_db_options_.info_log);
+  job_context.Clean();
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..058df4da7
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,1013 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+  return versions_->min_log_number_to_keep();
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+  mutex_.AssertHeld();
+  if (!pending_outputs_.empty()) {
+    return *pending_outputs_.begin();
+  }
+  return std::numeric_limits<uint64_t>::max();
+}
+
+Status DBImpl::DisableFileDeletions() {
+  Status s;
+  int my_disable_delete_obsolete_files;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = DisableFileDeletionsWithLock();
+    my_disable_delete_obsolete_files = disable_delete_obsolete_files_;
+  }
+  if (my_disable_delete_obsolete_files == 1) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Disabled, but already disabled. Counter: %d",
+                   my_disable_delete_obsolete_files);
+  }
+  return s;
+}
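+
+// File-deletion disabling is reference-counted: every DisableFileDeletions()
+// call increments disable_delete_obsolete_files_, and EnableFileDeletions()
+// below decrements it (or resets it to zero when force=true). Obsolete-file
+// deletion actually resumes only once the counter drops back to zero.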
+                   "Counter: %d",
+                   my_disable_delete_obsolete_files);
+  }
+  return s;
+}
+
+// FIXME: can be inconsistent with DisableFileDeletions in cases like
+// DBImplReadOnly
+Status DBImpl::DisableFileDeletionsWithLock() {
+  mutex_.AssertHeld();
+  ++disable_delete_obsolete_files_;
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+  // Job id == 0 means that this is not our background process, but rather
+  // user thread
+  JobContext job_context(0);
+  int saved_counter;  // initialize on all paths
+  {
+    InstrumentedMutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    saved_counter = disable_delete_obsolete_files_;
+    if (saved_counter == 0) {
+      FindObsoleteFiles(&job_context, true);
+      bg_cv_.SignalAll();
+    }
+  }
+  if (saved_counter == 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Enabled, but not really enabled. "
+                   "Counter: %d",
+                   saved_counter);
+  }
+  job_context.Clean();
+  LogFlush(immutable_db_options_.info_log);
+  return Status::OK();
+}
+
+bool DBImpl::IsFileDeletionsEnabled() const {
+  return 0 == disable_delete_obsolete_files_;
+}
+
+// * Returns the list of live files in 'sst_live' and 'blob_live'.
+// If it's doing full scan:
+// * Returns the list of all files in the filesystem in
+//   'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//   mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force ||
+             mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    const uint64_t now_micros = immutable_db_options_.clock->NowMicros();
+    if ((delete_obsolete_files_last_run_ +
+         mutable_db_options_.delete_obsolete_files_period_micros) <
+        now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // don't delete files that might be currently written to from compaction
+  // threads
+  // Since job_context->min_pending_output is set, until file scan finishes,
+  // mutex_ cannot be released. Otherwise, we might see no min_pending_output
+  // here but later find newer generated unfinalized files while scanning.
+  job_context->min_pending_output = MinObsoleteSstNumberToKeep();
+
+  // Get obsolete files. This function will also update the list of
+  // pending files in VersionSet().
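+  // Only files numbered below job_context->min_pending_output come back as
+  // obsolete here; anything at or above it may still be written by an
+  // in-flight job and is left alone.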
+  versions_->GetObsoleteFiles(
+      &job_context->sst_delete_files, &job_context->blob_delete_files,
+      &job_context->manifest_delete_files, job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files and
+  // job_context->blob_delete_files as "grabbed for purge" so that other
+  // threads calling FindObsoleteFiles with full_scan=true will not add these
+  // files to the candidate list for purge.
+  for (const auto& sst_to_del : job_context->sst_delete_files) {
+    MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+  }
+
+  for (const auto& blob_file : job_context->blob_delete_files) {
+    MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber());
+  }
+
+  // store the current filenum, lognum, etc
+  job_context->manifest_file_number = versions_->manifest_file_number();
+  job_context->pending_manifest_file_number =
+      versions_->pending_manifest_file_number();
+  job_context->log_number = MinLogNumberToKeep();
+  job_context->prev_log_number = versions_->prev_log_number();
+
+  if (doing_the_full_scan) {
+    versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
+    InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                  dbname_);
+    std::set<std::string> paths;
+    for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+         path_id++) {
+      paths.insert(immutable_db_options_.db_paths[path_id].path);
+    }
+
+    // Note that if cf_paths is not specified in the ColumnFamilyOptions
+    // of a particular column family, we use db_paths as the cf_paths
+    // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicates are removed while identifying
+    // unique files in PurgeObsoleteFiles.
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+           path_id++) {
+        auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+        if (paths.find(path) == paths.end()) {
+          paths.insert(path);
+        }
+      }
+    }
+
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      // set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processing.
+      std::vector<std::string> files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          path, io_opts, &files, /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (const std::string& file : files) {
+        uint64_t number;
+        FileType type;
+        // 1. If we cannot parse the file name, we skip;
+        // 2. If the file with file_number equal to number has already been
+        // grabbed for purge by another compaction job, or it has already been
+        // scheduled for purge, we also skip it if we
+        // are doing full scan in order to avoid double deletion of the same
+        // file under race conditions. See
+        // https://github.com/facebook/rocksdb/issues/3573
+        if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+            !ShouldPurge(number)) {
+          continue;
+        }
+
+        // TODO(icanadi) clean up this mess to avoid having one-off "/"
+        // prefixes
+        job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+      }
+    }
+
+    // Add log files in wal_dir
+    if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
+      std::vector<std::string> log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.wal_dir, io_opts, &log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
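+      // Unlike the db-path candidates above, these keep their bare file
+      // names; the wal_dir is carried separately as the second member of
+      // each candidate entry.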
+      for (const std::string& log_file : log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.wal_dir);
+      }
+    }
+
+    // Add info log files in db_log_dir
+    if (!immutable_db_options_.db_log_dir.empty() &&
+        immutable_db_options_.db_log_dir != dbname_) {
+      std::vector<std::string> info_log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.db_log_dir, io_opts, &info_log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (std::string& log_file : info_log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.db_log_dir);
+      }
+    }
+  } else {
+    // Instead of filling job_context->sst_live and job_context->blob_live,
+    // directly remove files that show up in any Version. This is because
+    // candidate files tend to be a small percentage of all files, so it is
+    // usually cheaper to check them against every version, compared to
+    // building a map for all files.
+    versions_->RemoveLiveFiles(job_context->sst_delete_files,
+                               job_context->blob_delete_files);
+  }
+
+  // Before potentially releasing mutex and waiting on condvar, increment
+  // pending_purge_obsolete_files_ so that another thread executing
+  // `GetSortedWals` will wait until this thread finishes execution since the
+  // other thread will be waiting for `pending_purge_obsolete_files_`.
+  // pending_purge_obsolete_files_ MUST be decremented if there is nothing to
+  // delete.
+  ++pending_purge_obsolete_files_;
+
+  Defer cleanup([job_context, this]() {
+    assert(job_context != nullptr);
+    if (!job_context->HaveSomethingToDelete()) {
+      mutex_.AssertHeld();
+      --pending_purge_obsolete_files_;
+    }
+  });
+
+  // logs_ is empty when called during recovery, in which case there can't yet
+  // be any tracked obsolete logs
+  log_write_mutex_.Lock();
+
+  if (alive_log_files_.empty() || logs_.empty()) {
+    mutex_.AssertHeld();
+    // We may reach here if the db is DBImplSecondary
+    log_write_mutex_.Unlock();
+    return;
+  }
+
+  if (!alive_log_files_.empty() && !logs_.empty()) {
+    uint64_t min_log_number = job_context->log_number;
+    size_t num_alive_log_files = alive_log_files_.size();
+    // find newly obsoleted log files
+    while (alive_log_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_log_files_.begin();
+      if (immutable_db_options_.recycle_log_file_num >
+          log_recycle_files_.size()) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "adding log %" PRIu64 " to recycle list\n",
+                       earliest.number);
+        log_recycle_files_.push_back(earliest.number);
+      } else {
+        job_context->log_delete_files.push_back(earliest.number);
+      }
+      if (job_context->size_log_to_delete == 0) {
+        job_context->prev_total_log_size = total_log_size_;
+        job_context->num_alive_log_files = num_alive_log_files;
+      }
+      job_context->size_log_to_delete += earliest.size;
+      total_log_size_ -= earliest.size;
+      alive_log_files_.pop_front();
+
+      // Current log should always stay alive since it can't have
+      // number < MinLogNumber().
+      assert(alive_log_files_.size());
+    }
+    log_write_mutex_.Unlock();
+    mutex_.Unlock();
+    log_write_mutex_.Lock();
+    while (!logs_.empty() && logs_.front().number < min_log_number) {
+      auto& log = logs_.front();
+      if (log.IsSyncing()) {
+        log_sync_cv_.Wait();
+        // logs_ could have changed while we were waiting.
+        continue;
+      }
+      logs_to_free_.push_back(log.ReleaseWriter());
+      logs_.pop_front();
+    }
+    // Current log cannot be obsolete.
+    assert(!logs_.empty());
+  }
+
+  // We're just cleaning up for DB::Write().
+  assert(job_context->logs_to_free.empty());
+  job_context->logs_to_free = logs_to_free_;
+
+  logs_to_free_.clear();
+  log_write_mutex_.Unlock();
+  mutex_.Lock();
+  job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+                                        log_recycle_files_.end());
+}
+
+// Delete obsolete files and log status and information of file deletion
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+                                    const std::string& path_to_sync,
+                                    FileType type, uint64_t number) {
+  TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
+                           const_cast<std::string*>(&fname));
+
+  Status file_deletion_status;
+  if (type == kTableFile || type == kBlobFile || type == kWalFile) {
+    // Rate limit WAL deletion only if it's in the DB dir
+    file_deletion_status = DeleteDBFile(
+        &immutable_db_options_, fname, path_to_sync,
+        /*force_bg=*/false,
+        /*force_fg=*/(type == kWalFile) ? !wal_in_db_path_ : false);
+  } else {
+    file_deletion_status = env_->DeleteFile(fname);
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+                           &file_deletion_status);
+  if (file_deletion_status.ok()) {
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+                    fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  } else if (env_->FileExists(fname).IsNotFound()) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+        " -- %s\n",
+        job_id, fname.c_str(), type, number,
+        file_deletion_status.ToString().c_str());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+                    job_id, fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  }
+  if (type == kTableFile) {
+    EventHelpers::LogAndNotifyTableFileDeletion(
+        &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+        immutable_db_options_.listeners);
+  }
+  if (type == kBlobFile) {
+    EventHelpers::LogAndNotifyBlobFileDeletion(
+        &event_logger_, immutable_db_options_.listeners, job_id, number, fname,
+        file_deletion_status, GetName());
+  }
+}
+
+// Diffs the files listed in filenames against the live files and possibly
+// removes those that do not belong. Also removes all the files in
+// sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+  // we'd better have something to delete
+  assert(state.HaveSomethingToDelete());
+
+  // FindObsoleteFiles() should've populated this, so it is nonzero
+  assert(state.manifest_file_number != 0);
+
+  // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
+  std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
+                                            state.sst_live.end());
+  std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
+                                             state.blob_live.end());
+  std::unordered_set<uint64_t> log_recycle_files_set(
+      state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+  auto candidate_files = state.full_scan_candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() + state.sst_delete_files.size() +
+      state.blob_delete_files.size() + state.log_delete_files.size() +
+      state.manifest_delete_files.size());
+  // We may ignore the dbname when generating the file names.
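+  // For instance, an SST entry becomes something like "000123.sst" paired
+  // with its db_path, and a WAL entry becomes "000042.log" paired with the
+  // wal_dir; the numbers here are illustrative only.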
+  for (auto& file : state.sst_delete_files) {
+    if (!file.only_delete_metadata) {
+      candidate_files.emplace_back(
+          MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+    }
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
+    }
+    file.DeleteMetadata();
+  }
+
+  for (const auto& blob_file : state.blob_delete_files) {
+    candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()),
+                                 blob_file.GetPath());
+  }
+
+  auto wal_dir = immutable_db_options_.GetWalDir();
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.emplace_back(LogFileName(file_num), wal_dir);
+    }
+  }
+  for (const auto& filename : state.manifest_delete_files) {
+    candidate_files.emplace_back(filename, dbname_);
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  std::sort(candidate_files.begin(), candidate_files.end(),
+            [](const JobContext::CandidateFileInfo& lhs,
+               const JobContext::CandidateFileInfo& rhs) {
+              if (lhs.file_name > rhs.file_name) {
+                return true;
+              } else if (lhs.file_name < rhs.file_name) {
+                return false;
+              } else {
+                return (lhs.file_path > rhs.file_path);
+              }
+            });
+  candidate_files.erase(
+      std::unique(candidate_files.begin(), candidate_files.end()),
+      candidate_files.end());
+
+  if (state.prev_total_log_size > 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[JOB %d] Try to delete WAL files size %" PRIu64
+                   ", prev total WAL file size %" PRIu64
+                   ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+                   state.job_id, state.size_log_to_delete,
+                   state.prev_total_log_size, state.num_alive_log_files);
+  }
+
+  std::vector<std::string> old_info_log_files;
+  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                dbname_);
+
+  // File numbers of the most recent two OPTIONS files in candidate_files
+  // (found in the previous FindObsoleteFiles(full_scan=true)).
+  // At this point, there must not be any duplicate file numbers in
+  // candidate_files.
+  uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+  uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& fname = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+        type != kOptionsFile) {
+      continue;
+    }
+    if (number > optsfile_num1) {
+      optsfile_num2 = optsfile_num1;
+      optsfile_num1 = number;
+    } else if (number > optsfile_num2) {
+      optsfile_num2 = number;
+    }
+  }
+
+  // Close WALs before trying to delete them.
+  for (const auto w : state.logs_to_free) {
+    // TODO: maybe check the return value of Close.
+    auto s = w->Close();
+    s.PermitUncheckedError();
+  }
+
+  bool own_files = OwnTablesAndLogs();
+  std::unordered_set<uint64_t> files_to_del;
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& to_delete = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
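+    // As a rough guide to the naming scheme: "000123.sst" parses to
+    // (123, kTableFile), "000042.log" to (42, kWalFile), and
+    // "MANIFEST-000005" to (5, kDescriptorFile); the numbers are
+    // illustrative only.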
+    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kWalFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number) ||
+                (log_recycle_files_set.find(number) !=
+                 log_recycle_files_set.end()));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        // If the second condition is not there, this makes
+        // DontDeletePendingOutputs fail
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               number >= state.min_pending_output;
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kBlobFile:
+        keep = number >= state.min_pending_output ||
+               (blob_live_set.find(number) != blob_live_set.end());
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file
+        //
+        // TODO(yhchiang): carefully modify the third condition to safely
+        //                 remove the temp options files.
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               (blob_live_set.find(number) != blob_live_set.end()) ||
+               (number == state.pending_manifest_file_number) ||
+               (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kOptionsFile:
+        keep = (number >= optsfile_num2);
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    std::string fname;
+    std::string dir_to_sync;
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+      fname = MakeTableFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else if (type == kBlobFile) {
+      fname = BlobFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else {
+      dir_to_sync = (type == kWalFile) ? wal_dir : dbname_;
+      fname = dir_to_sync +
+              ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+                       (!to_delete.empty() && to_delete.front() == '/')
+                   ? ""
+                   : "/") +
+              to_delete;
+    }
+
+#ifndef ROCKSDB_LITE
+    if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
+                             immutable_db_options_.WAL_size_limit_MB > 0)) {
+      wal_manager_.ArchiveWALFile(fname, number);
+      continue;
+    }
+#endif  // !ROCKSDB_LITE
+
+    // If I do not own these files, e.g. secondary instance with
+    // max_open_files = -1, then there is no need to delete or schedule
+    // deletion of these files since they will be removed by their owner,
+    // e.g. the primary instance.
+    if (!own_files) {
+      continue;
+    }
+    if (schedule_only) {
+      InstrumentedMutexLock guard_lock(&mutex_);
+      SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+    } else {
+      DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+    }
+  }
+
+  {
+    // After purging obsolete files, remove them from files_grabbed_for_purge_.
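+    // These are the numbers that FindObsoleteFiles() marked via
+    // MarkAsGrabbedForPurge(); dropping them here lets a later full scan
+    // consider those file numbers again.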
+    InstrumentedMutexLock guard_lock(&mutex_);
+    autovector<uint64_t> to_be_removed;
+    for (auto fn : files_grabbed_for_purge_) {
+      if (files_to_del.count(fn) != 0) {
+        to_be_removed.emplace_back(fn);
+      }
+    }
+    for (auto fn : to_be_removed) {
+      files_grabbed_for_purge_.erase(fn);
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  if (old_info_log_file_count != 0 &&
+      old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end =
+        old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      std::string full_path_to_delete =
+          (immutable_db_options_.db_log_dir.empty()
+               ? dbname_
+               : immutable_db_options_.db_log_dir) +
+          "/" + to_delete;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Delete info log file %s\n", state.job_id,
+                     full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
+      if (!s.ok()) {
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
+                          state.job_id, to_delete.c_str(),
+                          s.ToString().c_str());
+        }
+      }
+    }
+  }
+#ifndef ROCKSDB_LITE
+  wal_manager_.PurgeObsoleteWALFiles();
+#endif  // ROCKSDB_LITE
+  LogFlush(immutable_db_options_.info_log);
+  InstrumentedMutexLock l(&mutex_);
+  --pending_purge_obsolete_files_;
+  assert(pending_purge_obsolete_files_ >= 0);
+  if (schedule_only) {
+    // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
+    // while holding mutex (for GetSortedWalFiles() etc.)
+    SchedulePurge();
+  }
+  if (pending_purge_obsolete_files_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
+  if (job_context.HaveSomethingToDelete()) {
+    bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
+    PurgeObsoleteFiles(job_context, defer_purge);
+  }
+  job_context.Clean();
+  mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  // we must look through the memtables for two phase transactions
+  // that have been committed but not yet flushed
+  std::unordered_set<MemTable*> memtables_to_flush_set(
+      memtables_to_flush.begin(), memtables_to_flush.end());
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  std::unordered_set<MemTable*> memtables_to_flush_set;
+  for (const autovector<MemTable*>* memtables : memtables_to_flush) {
+    memtables_to_flush_set.insert(memtables->begin(), memtables->end());
+  }
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list) {
+  assert(vset != nullptr);
+
+  // Precompute the min log number containing unflushed data for the column
+  // family being flushed (`cfd_to_flush`).
+  uint64_t cf_min_log_number_to_keep = 0;
+  for (auto& e : edit_list) {
+    if (e->HasLogNumber()) {
+      cf_min_log_number_to_keep =
+          std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+    }
+  }
+  if (cf_min_log_number_to_keep == 0) {
+    // No version edit contains information on log number. The log number
+    // for this column family should stay the same as it is.
+    cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+  }
+
+  // Get min log number containing unflushed data for other column families.
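+  // For example, if this column family will keep log 12 after the flush while
+  // the minimum across the other column families is 9, every log numbered 9
+  // or higher must be kept, so the result below is 9.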
+  uint64_t min_log_number_to_keep =
+      vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+  if (cf_min_log_number_to_keep != 0) {
+    min_log_number_to_keep =
+        std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists) {
+  assert(vset != nullptr);
+  assert(!cfds_to_flush.empty());
+  assert(cfds_to_flush.size() == edit_lists.size());
+
+  uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+  for (const auto& edit_list : edit_lists) {
+    uint64_t log = 0;
+    for (const auto& e : edit_list) {
+      if (e->HasLogNumber()) {
+        log = std::max(log, e->GetLogNumber());
+      }
+    }
+    if (log != 0) {
+      min_log_number_to_keep = std::min(min_log_number_to_keep, log);
+    }
+  }
+  if (min_log_number_to_keep == std::numeric_limits<uint64_t>::max()) {
+    min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber();
+    for (size_t i = 1; i < cfds_to_flush.size(); i++) {
+      min_log_number_to_keep =
+          std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber());
+    }
+  }
+
+  std::unordered_set<const ColumnFamilyData*> flushed_cfds(
+      cfds_to_flush.begin(), cfds_to_flush.end());
+  min_log_number_to_keep =
+      std::min(min_log_number_to_keep,
+               vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds));
+
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  // Calculate updated min_log_number_to_keep
+  // Since the function should only be called in 2pc mode, log number in
+  // the version edit should be sufficient.
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list);
+
+  // if we are in 2pc mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+  //
+  // TODO: iterating over all column families under db mutex.
+  // should find a more optimal solution
+  auto min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  assert(cfds_to_flush.size() == edit_lists.size());
+  assert(cfds_to_flush.size() == memtables_to_flush.size());
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists);
+
+  uint64_t min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+
+  return min_log_number_to_keep;
+}
+
+void DBImpl::SetDBId(std::string&& id, bool read_only,
+                     RecoveryContext* recovery_ctx) {
+  assert(db_id_.empty());
+  assert(!id.empty());
+  db_id_ = std::move(id);
+  if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
+    assert(recovery_ctx != nullptr);
+    assert(versions_->GetColumnFamilySet() != nullptr);
+    VersionEdit edit;
+    edit.SetDBId(db_id_);
+    versions_->db_id_ = db_id_;
+    recovery_ctx->UpdateVersionEdits(
+        versions_->GetColumnFamilySet()->GetDefault(), edit);
+  }
+}
+
+Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) {
+  Status s;
+  // Check for the IDENTITY file and create it if not there, or
+  // broken, or not matching the manifest
+  std::string db_id_in_file;
+  s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+  if (s.ok()) {
+    s = GetDbIdentityFromIdentityFile(&db_id_in_file);
+    if (s.ok() && !db_id_in_file.empty()) {
+      if (db_id_.empty()) {
+        // Loaded from file and wasn't already known from manifest
+        SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
+        return s;
+      } else if (db_id_ == db_id_in_file) {
+        // Loaded from file and matches manifest
+        return s;
+      }
+    }
+  }
+  if (s.IsNotFound()) {
+    s = Status::OK();
+  }
+  if (!s.ok()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  // Otherwise IDENTITY file is missing or no good.
+  // Generate new id if needed
+  if (db_id_.empty()) {
+    SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
+  }
+  // Persist it to IDENTITY file if allowed
+  if (!read_only) {
+    s = SetIdentityFile(env_, dbname_, db_id_);
+  }
+  return s;
+}
+
+Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+  std::vector<std::string> paths;
+  paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
+  for (const auto& db_path : immutable_db_options_.db_paths) {
+    paths.push_back(
+        NormalizePath(db_path.path + std::string(1, kFilePathSeparator)));
+  }
+  for (const auto* cfd : *versions_->GetColumnFamilySet()) {
+    for (const auto& cf_path : cfd->ioptions()->cf_paths) {
+      paths.push_back(
+          NormalizePath(cf_path.path + std::string(1, kFilePathSeparator)));
+    }
+  }
+  // Dedup paths
+  std::sort(paths.begin(), paths.end());
+  paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+  uint64_t next_file_number = versions_->current_next_file_number();
+  uint64_t largest_file_number = next_file_number;
+  Status s;
+  for (const auto& path : paths) {
+    std::vector<std::string> files;
+    s = env_->GetChildren(path, &files);
+    if (!s.ok()) {
+      break;
+    }
+    for (const auto& fname : files) {
+      uint64_t number = 0;
+      FileType type;
+      if (!ParseFileName(fname, &number, &type)) {
+        continue;
+      }
+      // path ends with '/' or '\\'
+      const std::string normalized_fpath = path + fname;
+      largest_file_number = std::max(largest_file_number, number);
+      if (type == kTableFile && number >= next_file_number &&
+          recovery_ctx->files_to_delete_.find(normalized_fpath) ==
+              recovery_ctx->files_to_delete_.end()) {
+        recovery_ctx->files_to_delete_.emplace(normalized_fpath);
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (largest_file_number >= next_file_number) {
+    versions_->next_file_number_.store(largest_file_number + 1);
+  }
+
+  VersionEdit edit;
+  edit.SetNextFile(versions_->next_file_number_.load());
+  assert(versions_->GetColumnFamilySet());
+  ColumnFamilyData* default_cfd =
+      versions_->GetColumnFamilySet()->GetDefault();
+  assert(default_cfd);
+  recovery_ctx->UpdateVersionEdits(default_cfd, edit);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..40ffa2e85
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,2106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/table.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src,
+                        bool read_only, Status* logger_creation_s) {
+  auto db_options =
+      SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
+  ImmutableDBOptions immutable_db_options(db_options);
+  auto cf_options =
+      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
+                          bool read_only, Status* logger_creation_s) {
+  DBOptions result(src);
+
+  if (result.env == nullptr) {
+    result.env = Env::Default();
+  }
+
+  // result.max_open_files of -1 means an "infinite" number of open files.
+  if (result.max_open_files != -1) {
+    int max_max_open_files = port::GetMaxOpenFiles();
+    if (max_max_open_files == -1) {
+      max_max_open_files = 0x400000;
+    }
+    ClipToRange(&result.max_open_files, 20, max_max_open_files);
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+                             &result.max_open_files);
+  }
+
+  if (result.info_log == nullptr && !read_only) {
+    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+      if (logger_creation_s) {
+        *logger_creation_s = s;
+      }
+    }
+  }
+
+  if (!result.write_buffer_manager) {
+    result.write_buffer_manager.reset(
+        new WriteBufferManager(result.db_write_buffer_size));
+  }
+  auto bg_job_limits = DBImpl::GetBGJobLimits(
+      result.max_background_flushes, result.max_background_compactions,
+      result.max_background_jobs, true /* parallelize_compactions */);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+                                           Env::Priority::LOW);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+                                           Env::Priority::HIGH);
+
+  if (result.rate_limiter.get() != nullptr) {
+    if (result.bytes_per_sync == 0) {
+      result.bytes_per_sync = 1024 * 1024;
+    }
+  }
+
+  if (result.delayed_write_rate == 0) {
+    if (result.rate_limiter.get() != nullptr) {
+      result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+    }
+    if (result.delayed_write_rate == 0) {
+      result.delayed_write_rate = 16 * 1024 * 1024;
+    }
+  }
+
+  if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+    // WAL archiving (WAL_ttl_seconds / WAL_size_limit_MB) is incompatible
+    // with WAL recycling.
+    result.recycle_log_file_num = 0;
+  }
+
+  if (result.recycle_log_file_num &&
+      (result.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords ||
+       result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+       result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+    // - kTolerateCorruptedTailRecords is inconsistent with recycle log file
+    //   feature. WAL recycling expects recovery success upon encountering a
+    //   corrupt record at the point where new data ends and recycled data
+    //   remains at the tail.
+    //   However, `kTolerateCorruptedTailRecords` must fail
+    //   upon encountering any such corrupt record, as it cannot differentiate
+    //   between this and a real corruption, which would cause committed
+    //   updates to be truncated -- a violation of the recovery guarantee.
+    // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
+    //   the recycle log file feature temporarily due to a bug found
+    //   introducing a hole in the recovered data
+    //   (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
+    //   Besides this bug, we believe the features are fundamentally
+    //   compatible.
+    result.recycle_log_file_num = 0;
+  }
+
+  if (result.db_paths.size() == 0) {
+    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+  } else if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+  if (!result.wal_dir.empty()) {
+    // If there is a wal_dir already set, check to see if the wal_dir is the
+    // same as the dbname AND the same as the db_path[0] (which must exist
+    // from a few lines ago). If the wal_dir matches both of these values,
+    // then clear the wal_dir value, which will make wal_dir == dbname. Most
+    // likely this condition was the result of reading an old options file
+    // where we forced wal_dir to be set (to dbname).
+    auto npath = NormalizePath(dbname + "/");
+    if (npath == NormalizePath(result.wal_dir + "/") &&
+        npath == NormalizePath(result.db_paths[0].path + "/")) {
+      result.wal_dir.clear();
+    }
+  }
+
+  if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
+    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+  }
+
+  if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+    result.compaction_readahead_size = 1024 * 1024 * 2;
+  }
+
+  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+  // guarantee that consecutive log files have consecutive sequence ids, which
+  // makes recovery complicated.
+  if (result.allow_2pc) {
+    result.avoid_flush_during_recovery = false;
+  }
+
+#ifndef ROCKSDB_LITE
+  ImmutableDBOptions immutable_db_options(result);
+  if (!immutable_db_options.IsWalDirSameAsDBPath()) {
+    // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+    // cannot tell for sure. In either case, assume they're different and
+    // explicitly clean up the trash log files (bypass DeleteScheduler).
+    // Do this first so even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe
+    std::vector<std::string> filenames;
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    auto wal_dir = immutable_db_options.GetWalDir();
+    Status s = immutable_db_options.fs->GetChildren(
+        wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
+    s.PermitUncheckedError();  // TODO: What to do on error?
+    for (std::string& filename : filenames) {
+      if (filename.find(".log.trash",
+                        filename.length() -
+                            std::string(".log.trash").length()) !=
+          std::string::npos) {
+        std::string trash_file = wal_dir + "/" + filename;
+        result.env->DeleteFile(trash_file).PermitUncheckedError();
+      }
+    }
+  }
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet; when we open the DB we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if
+  // SstFileManager was not used).
+  auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+  for (size_t i = 0; i < result.db_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path)
+        .PermitUncheckedError();
+  }
+
+  // Create a default SstFileManager for purposes of tracking compaction size
+  // and facilitating recovery from out of space errors.
+  if (result.sst_file_manager.get() == nullptr) {
+    std::shared_ptr<SstFileManager> sst_file_manager(
+        NewSstFileManager(result.env, result.info_log));
+    result.sst_file_manager = sst_file_manager;
+  }
+#endif  // !ROCKSDB_LITE
+
+  // Supported wal compression types
+  if (!StreamingCompressionTypeSupported(result.wal_compression)) {
+    result.wal_compression = kNoCompression;
+    ROCKS_LOG_WARN(result.info_log,
+                   "wal_compression is disabled since only zstd is supported");
+  }
+
+  if (!result.paranoid_checks) {
+    result.skip_checking_sst_file_sizes_on_db_open = true;
+    ROCKS_LOG_INFO(result.info_log,
+                   "file size check will be skipped during open.");
+  }
+
+  return result;
+}
+
+namespace {
+Status ValidateOptionsByTable(
+    const DBOptions& db_opts,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto cf : column_families) {
+    s = ValidateOptions(db_opts, cf.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+Status DBImpl::ValidateOptions(
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto& cfd : column_families) {
+    s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  s = ValidateOptions(db_options);
+  return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+  if (db_options.db_paths.size() > 4) {
+    return Status::NotSupported(
+        "More than four DB paths are not supported yet. ");
+  }
+
+  if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+    // Protect against assert in PosixMMapReadableFile constructor
+    return Status::NotSupported(
+        "If memory mapped reads (allow_mmap_reads) are enabled "
+        "then direct I/O reads (use_direct_reads) must be disabled. ");
+  }
+
+  if (db_options.allow_mmap_writes &&
+      db_options.use_direct_io_for_flush_and_compaction) {
+    return Status::NotSupported(
+        "If memory mapped writes (allow_mmap_writes) are enabled "
+        "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+        "be disabled. ");
"); + } + + if (db_options.keep_log_file_num == 0) { + return Status::InvalidArgument("keep_log_file_num must be greater than 0"); + } + + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with " + "!allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + + if (db_options.atomic_flush && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "atomic_flush is incompatible with enable_pipelined_write"); + } + + // TODO remove this restriction + if (db_options.atomic_flush && db_options.best_efforts_recovery) { + return Status::InvalidArgument( + "atomic_flush is currently incompatible with best-efforts recovery"); + } + + if (db_options.use_direct_io_for_flush_and_compaction && + 0 == db_options.writable_file_max_buffer_size) { + return Status::InvalidArgument( + "writes in direct IO require writable_file_max_buffer_size > 0"); + } + + return Status::OK(); +} + +Status DBImpl::NewDB(std::vector* new_filenames) { + VersionEdit new_db; + Status s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.write_dbid_to_manifest) { + std::string temp_db_id; + GetDbIdentityFromIdentityFile(&temp_db_id); + new_db.SetDBId(temp_db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); + const std::string manifest = DescriptorFileName(dbname_, 1); + { + if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } + std::unique_ptr file; + FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); + s = NewWritableFile(fs_.get(), manifest, &file, file_options); + if (!s.ok()) { + return s; + } + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; + file->SetPreallocationBlockSize( + immutable_db_options_.manifest_preallocation_size); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), manifest, file_options, immutable_db_options_.clock, + io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, + nullptr, tmp_set.Contains(FileType::kDescriptorFile), + tmp_set.Contains(FileType::kDescriptorFile))); + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = SyncManifest(&immutable_db_options_, log.file()); + } + } + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir()); + if (new_filenames) { + new_filenames->emplace_back( + manifest.substr(manifest.find_last_of("/\\") + 1)); + } + } else { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } + return s; +} + +IOStatus DBImpl::CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory) { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. 
+  // One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but when "dir" doesn't exist.
+  IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
+}
+
+IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
+                                     const std::string& wal_dir,
+                                     const std::vector<DbPath>& data_paths) {
+  IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  if (!wal_dir.empty() && dbname != wal_dir) {
+    io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  data_dirs_.clear();
+  for (auto& p : data_paths) {
+    const std::string db_path = p.path;
+    if (db_path == dbname) {
+      data_dirs_.emplace_back(nullptr);
+    } else {
+      std::unique_ptr<FSDirectory> path_directory;
+      io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      data_dirs_.emplace_back(path_directory.release());
+    }
+  }
+  assert(data_dirs_.size() == data_paths.size());
+  return IOStatus::OK();
+}
+
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
+    uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  std::vector<std::string> files_in_dbname;
+  if (!read_only) {
+    Status s = directories_.SetDirectories(fs_.get(), dbname_,
+                                           immutable_db_options_.wal_dir,
+                                           immutable_db_options_.db_paths);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::string current_fname = CurrentFileName(dbname_);
+    // Path to any MANIFEST file in the db dir. It does not matter which one.
+    // Since best-efforts recovery ignores the CURRENT file, the existence of
+    // a MANIFEST indicates recovery of an existing db. If no MANIFEST can be
+    // found, a new db will be created.
+    std::string manifest_path;
+    if (!immutable_db_options_.best_efforts_recovery) {
+      s = env_->FileExists(current_fname);
+    } else {
+      s = Status::NotFound();
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      Status io_s = immutable_db_options_.fs->GetChildren(
+          dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+      if (!io_s.ok()) {
+        s = io_s;
+        files_in_dbname.clear();
+      }
+      for (const std::string& file : files_in_dbname) {
+        uint64_t number = 0;
+        FileType type = kWalFile;  // initialize
+        if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+          uint64_t bytes;
+          s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
+          if (s.ok() && bytes != 0) {
+            // Found non-empty MANIFEST (descriptor log), thus best-efforts
+            // recovery does not have to treat the db as empty.
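+            // Descriptor files are named like "MANIFEST-000005", so any such
+            // non-empty file is enough to anchor recovery here (the number is
+            // illustrative only).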
+            manifest_path = dbname_ + "/" + file;
+            break;
+          }
+        }
+      }
+    }
+    if (s.IsNotFound()) {
+      if (immutable_db_options_.create_if_missing) {
+        s = NewDB(&files_in_dbname);
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            current_fname, "does not exist (create_if_missing is false)");
+      }
+    } else if (s.ok()) {
+      if (immutable_db_options_.error_if_exists) {
+        return Status::InvalidArgument(dbname_,
+                                       "exists (error_if_exists is true)");
+      }
+    } else {
+      // Unexpected error reading file
+      assert(s.IsIOError());
+      return s;
+    }
+    // Verify compatibility of file_options_ and filesystem
+    {
+      std::unique_ptr<FSRandomAccessFile> idfile;
+      FileOptions customized_fs(file_options_);
+      customized_fs.use_direct_reads |=
+          immutable_db_options_.use_direct_io_for_flush_and_compaction;
+      const std::string& fname =
+          manifest_path.empty() ? current_fname : manifest_path;
+      s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+      if (!s.ok()) {
+        std::string error_str = s.ToString();
+        // Check if unsupported Direct I/O is the root cause
+        customized_fs.use_direct_reads = false;
+        s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+        if (s.ok()) {
+          return Status::InvalidArgument(
+              "Direct I/O is not supported by the specified DB.");
+        } else {
+          return Status::InvalidArgument(
+              "Found options incompatible with filesystem", error_str.c_str());
+        }
+      }
+    }
+  } else if (immutable_db_options_.best_efforts_recovery) {
+    assert(files_in_dbname.empty());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    Status s = immutable_db_options_.fs->GetChildren(
+        dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument(dbname_,
+                                     "does not exist (open for read only)");
+    } else if (s.IsIOError()) {
+      return s;
+    }
+    assert(s.ok());
+  }
+  assert(db_id_.empty());
+  Status s;
+  bool missing_table_file = false;
+  if (!immutable_db_options_.best_efforts_recovery) {
+    s = versions_->Recover(column_families, read_only, &db_id_);
+  } else {
+    assert(!files_in_dbname.empty());
+    s = versions_->TryRecover(column_families, read_only, files_in_dbname,
+                              &db_id_, &missing_table_file);
+    if (s.ok()) {
+      // TryRecover may delete previous column_family_set_.
+      column_family_memtables_.reset(
+          new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  s = SetupDBId(read_only, recovery_ctx);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n",
+                 db_id_.c_str());
+  if (s.ok() && !read_only) {
+    s = DeleteUnreferencedSstFiles(recovery_ctx);
+  }
+
+  if (immutable_db_options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok() && !read_only) {
+    // TODO: share file descriptors (FSDirectory) with SetDirectories above
+    std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      s = cfd->AddDirectories(&created_dirs);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  std::vector<std::string> files_in_wal_dir;
+  if (s.ok()) {
+    // Initial max_total_in_memory_state_ before recovering WALs. Log recovery
+    // may check this value to decide whether to flush.
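+    // For example, a single column family with write_buffer_size = 64 MB and
+    // max_write_buffer_number = 2 contributes 128 MB to this initial total
+    // (the numbers are illustrative only).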
+    max_total_in_memory_state_ = 0;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      max_total_in_memory_state_ +=
+          mutable_cf_options->write_buffer_size *
+          mutable_cf_options->max_write_buffer_number;
+    }
+
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that prev_log_number() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    auto wal_dir = immutable_db_options_.GetWalDir();
+    if (!immutable_db_options_.best_efforts_recovery) {
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      s = immutable_db_options_.fs->GetChildren(
+          wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
+    }
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument("wal_dir not found", wal_dir);
+    } else if (!s.ok()) {
+      return s;
+    }
+
+    std::unordered_map<uint64_t, std::string> wal_files;
+    for (const auto& file : files_in_wal_dir) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type) && type == kWalFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new DB, wal_dir contains "
+              "existing log file: ",
+              file);
+        } else {
+          wal_files[number] = LogFileName(wal_dir, number);
+        }
+      }
+    }
+
+    if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+      if (!immutable_db_options_.best_efforts_recovery) {
+        // Verify WALs in MANIFEST.
+        s = versions_->GetWalSet().CheckWals(env_, wal_files);
+      }  // else since best-efforts recovery does not recover from WALs, no
+         // need to check WALs.
+    } else if (!versions_->GetWalSet().GetWals().empty()) {
+      // Tracking is disabled, clear previously tracked WALs from MANIFEST,
+      // otherwise, in the future, if WAL tracking is enabled again,
+      // since the WALs deleted when WAL tracking is disabled are not persisted
+      // into MANIFEST, the WAL check may fail.
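+      // For example, if the largest tracked WAL number is 7, recording
+      // DeleteWalsBefore(8) below clears every tracked WAL from the MANIFEST
+      // (the number is illustrative only).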
+      VersionEdit edit;
+      WalNumber max_wal_number =
+          versions_->GetWalSet().GetWals().rbegin()->first;
+      edit.DeleteWalsBefore(max_wal_number + 1);
+      assert(recovery_ctx != nullptr);
+      assert(versions_->GetColumnFamilySet() != nullptr);
+      recovery_ctx->UpdateVersionEdits(
+          versions_->GetColumnFamilySet()->GetDefault(), edit);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!wal_files.empty()) {
+      if (error_if_wal_file_exists) {
+        return Status::Corruption(
+            "The db was opened in readonly mode with error_if_wal_file_exists "
+            "flag but a WAL file already exists");
+      } else if (error_if_data_exists_in_wals) {
+        for (auto& wal_file : wal_files) {
+          uint64_t bytes;
+          s = env_->GetFileSize(wal_file.second, &bytes);
+          if (s.ok()) {
+            if (bytes > 0) {
+              return Status::Corruption(
+                  "error_if_data_exists_in_wals is set but there are data "
+                  "in WAL files.");
+            }
+          }
+        }
+      }
+    }
+
+    if (!wal_files.empty()) {
+      // Recover in the order in which the WALs were generated
+      std::vector<uint64_t> wals;
+      wals.reserve(wal_files.size());
+      for (const auto& wal_file : wal_files) {
+        wals.push_back(wal_file.first);
+      }
+      std::sort(wals.begin(), wals.end());
+
+      bool corrupted_wal_found = false;
+      s = RecoverLogFiles(wals, &next_sequence, read_only,
+                          &corrupted_wal_found, recovery_ctx);
+      if (corrupted_wal_found && recovered_seq != nullptr) {
+        *recovered_seq = next_sequence;
+      }
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
+        }
+      }
+    }
+  }
+
+  if (read_only) {
+    // If we are opening as read-only, we need to update options_file_number_
+    // to reflect the most recent OPTIONS file. It does not matter for regular
+    // read-write db instances because options_file_number_ will later be
+    // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+    std::vector<std::string> filenames;
+    if (s.ok()) {
+      const std::string normalized_dbname = NormalizePath(dbname_);
+      const std::string normalized_wal_dir =
+          NormalizePath(immutable_db_options_.GetWalDir());
+      if (immutable_db_options_.best_efforts_recovery) {
+        filenames = std::move(files_in_dbname);
+      } else if (normalized_dbname == normalized_wal_dir) {
+        filenames = std::move(files_in_wal_dir);
+      } else {
+        IOOptions io_opts;
+        io_opts.do_not_recurse = true;
+        s = immutable_db_options_.fs->GetChildren(
+            GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
+      }
+    }
+    if (s.ok()) {
+      uint64_t number = 0;
+      uint64_t options_file_number = 0;
+      FileType type;
+      for (const auto& fname : filenames) {
+        if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+          options_file_number = std::max(number, options_file_number);
+        }
+      }
+      versions_->options_file_number_ = options_file_number;
+      uint64_t options_file_size = 0;
+      if (options_file_number > 0) {
+        s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
+                              &options_file_size);
+      }
+      versions_->options_file_size_ = options_file_size;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+  mutex_.AssertHeld();
+  Status s;
+  // persist version when stats CF doesn't exist
+  bool should_persist_format_version = !persistent_stats_cfd_exists_;
+  mutex_.Unlock();
+  if (persistent_stats_cfd_exists_) {
+    // Check persistent stats format version compatibility.
+    // recreate the persistent stats CF if the format version is incompatible.
+    uint64_t format_version_recovered = 0;
+    Status s_format = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+    uint64_t compatible_version_recovered = 0;
+    Status s_compatible = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kCompatibleVersion,
+        &compatible_version_recovered);
+    // abort reading from the existing stats CF if any of the following is
+    // true:
+    // 1. failed to read the format version or compatible version from disk
+    // 2. the sst's format version is greater than the current format version,
+    //    meaning this sst is encoded with a newer RocksDB release, and the
+    //    current compatible version is below the sst's compatible version
+    if (!s_format.ok() || !s_compatible.ok() ||
+        (kStatsCFCurrentFormatVersion < format_version_recovered &&
+         kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+      if (!s_format.ok() || !s_compatible.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family since reading "
+            "persistent stats version key failed. Format key: %s, compatible "
+            "key: %s",
+            s_format.ToString().c_str(), s_compatible.ToString().c_str());
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family due to corrupted or "
+            "incompatible format version. Recovered format: %" PRIu64
+            "; recovered format compatible since: %" PRIu64 "\n",
+            format_version_recovered, compatible_version_recovered);
+      }
+      s = DropColumnFamily(persist_stats_cf_handle_);
+      if (s.ok()) {
+        s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+      }
+      ColumnFamilyHandle* handle = nullptr;
+      if (s.ok()) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+      }
+      if (s.ok()) {
+        persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+        // should also persist the version here because the old stats CF is
+        // discarded
+        should_persist_format_version = true;
+      }
+    }
+  }
+  if (should_persist_format_version) {
+    // The persistent stats CF is being created for the first time, so we
+    // need to write the format version key
+    WriteBatch batch;
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+                    std::to_string(kStatsCFCurrentFormatVersion));
+    }
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+                    std::to_string(kStatsCFCompatibleFormatVersion));
+    }
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+  }
+  mutex_.Lock();
+  return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+  mutex_.AssertHeld();
+  assert(!persist_stats_cf_handle_);
+  ColumnFamilyData* persistent_stats_cfd =
+      versions_->GetColumnFamilySet()->GetColumnFamily(
+          kPersistentStatsColumnFamilyName);
+  persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+  Status s;
+  if (persistent_stats_cfd != nullptr) {
+    // We are recovering from a DB which already contains the persistent
+    // stats CF. The CF itself was created in VersionSet::ApplyOneVersionEdit,
+    // but the column family handle was not, so we need to create the handle
+    // explicitly here.
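+    // Usage sketch (illustrative only; these are public Options fields):
+    // this path is reached when a caller opens the DB with stats persistence
+    // enabled, e.g.
+    //   Options options;
+    //   options.persist_stats_to_disk = true;
+    //   options.stats_persist_period_sec = 600;  // snapshot stats every 10min
+    //   DB* db = nullptr;
+    //   Status s = DB::Open(options, "/path/to/db", &db);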
+    persist_stats_cf_handle_ =
+        new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+  } else {
+    mutex_.Unlock();
+    ColumnFamilyHandle* handle = nullptr;
+    ColumnFamilyOptions cfo;
+    OptimizeForPersistentStats(&cfo);
+    s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+    persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+    mutex_.Lock();
+  }
+  return s;
+}
+
+Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
+  mutex_.AssertHeld();
+  assert(versions_->descriptor_log_ == nullptr);
+  Status s = versions_->LogAndApply(
+      recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
+      recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
+  if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
+    mutex_.Unlock();
+    for (const auto& fname : recovery_ctx.files_to_delete_) {
+      s = env_->DeleteFile(fname);
+      if (!s.ok()) {
+        break;
+      }
+    }
+    mutex_.Lock();
+  }
+  return s;
+}
+
+void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.wal_filter == nullptr) {
+    return;
+  }
+  assert(immutable_db_options_.wal_filter != nullptr);
+  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+  std::map<std::string, uint32_t> cf_name_id_map;
+  std::map<uint32_t, uint64_t> cf_lognumber_map;
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    assert(cfd);
+    cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+    cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+  }
+
+  wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map);
+#endif  // !ROCKSDB_LITE
+}
+
+bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+                                                const std::string& wal_fname,
+                                                log::Reader::Reporter& reporter,
+                                                Status& status,
+                                                bool& stop_replay,
+                                                WriteBatch& batch) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.wal_filter == nullptr) {
+    return true;
+  }
+  assert(immutable_db_options_.wal_filter != nullptr);
+  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+  WriteBatch new_batch;
+  bool batch_changed = false;
+
+  bool process_current_record = true;
+
+  WalFilter::WalProcessingOption wal_processing_option =
+      wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch,
+                                &batch_changed);
+
+  switch (wal_processing_option) {
+    case WalFilter::WalProcessingOption::kContinueProcessing:
+      // do nothing, proceed normally
+      break;
+    case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+      // skip current record
+      process_current_record = false;
+      break;
+    case WalFilter::WalProcessingOption::kStopReplay:
+      // skip current record and stop replay
+      process_current_record = false;
+      stop_replay = true;
+      break;
+    case WalFilter::WalProcessingOption::kCorruptedRecord: {
+      status = Status::Corruption("Corruption reported by Wal Filter ",
+                                  wal_filter.Name());
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        process_current_record = false;
+        reporter.Corruption(batch.GetDataSize(), status);
+      }
+      break;
+    }
+    default: {
+      // logical error which should not happen. If RocksDB threw, we would
+      // just do `throw std::logic_error`.
+      assert(false);
+      status = Status::NotSupported(
+          "Unknown WalProcessingOption returned by Wal Filter ",
+          wal_filter.Name());
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        // Ignore the error with current record processing.
+        stop_replay = true;
+      }
+      break;
+    }
+  }
+
+  if (!process_current_record) {
+    return false;
+  }
+
+  if (batch_changed) {
+    // Make sure that the count in the new batch is
+    // within the original count.
+    int new_count = WriteBatchInternal::Count(&new_batch);
+    int original_count = WriteBatchInternal::Count(&batch);
+    if (new_count > original_count) {
+      ROCKS_LOG_FATAL(
+          immutable_db_options_.info_log,
+          "Recovering log #%" PRIu64
+          " mode %d log filter %s returned "
+          "more records (%d) than original (%d) which is not allowed. "
+          "Aborting recovery.",
+          wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode),
+          wal_filter.Name(), new_count, original_count);
+      status = Status::NotSupported(
+          "More than original # of records "
+          "returned by Wal Filter ",
+          wal_filter.Name());
+      return false;
+    }
+    // Set the same sequence number in the new_batch
+    // as the original batch.
+    WriteBatchInternal::SetSequence(&new_batch,
+                                    WriteBatchInternal::Sequence(&batch));
+    batch = new_batch;
+  }
+  return true;
+#else   // !ROCKSDB_LITE
+  (void)wal_number;
+  (void)wal_fname;
+  (void)reporter;
+  (void)status;
+  (void)stop_replay;
+  (void)batch;
+  return true;
+#endif  // ROCKSDB_LITE
+}
+
+// REQUIRES: wal_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
+                               SequenceNumber* next_sequence, bool read_only,
+                               bool* corrupted_wal_found,
+                               RecoveryContext* recovery_ctx) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (status == nullptr ? "(ignoring error) " : ""), fname,
+                     static_cast<int>(bytes), s.ToString().c_str());
+      if (status != nullptr && status->ok()) {
+        *status = s;
+      }
+    }
+  };
+
+  mutex_.AssertHeld();
+  Status status;
+  std::unordered_map<int, VersionEdit> version_edits;
+  // no need to refcount because iteration is under mutex
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    version_edits.insert({cfd->GetID(), edit});
+  }
+  int job_id = next_job_id_.fetch_add(1);
+  {
+    auto stream = event_logger_.Log();
+    stream << "job" << job_id << "event"
+           << "recovery_started";
+    stream << "wal_files";
+    stream.StartArray();
+    for (auto wal_number : wal_numbers) {
+      stream << wal_number;
+    }
+    stream.EndArray();
+  }
+
+  // No-op for immutable_db_options_.wal_filter == nullptr.
+  InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+  bool stop_replay_by_wal_filter = false;
+  bool stop_replay_for_corruption = false;
+  bool flushed = false;
+  uint64_t corrupted_wal_number = kMaxSequenceNumber;
+  uint64_t min_wal_number = MinLogNumberToKeep();
+  if (!allow_2pc()) {
+    // In non-2pc mode, we skip WALs that do not back unflushed data.
+    min_wal_number =
+        std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
+  }
+  for (auto wal_number : wal_numbers) {
+    if (wal_number < min_wal_number) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Skipping log #%" PRIu64
+                     " since it is older than min log to keep #%" PRIu64,
+                     wal_number, min_wal_number);
+      continue;
+    }
+    // The previous incarnation may not have written any MANIFEST
+    // records after allocating this log number. So we manually
+    // update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(wal_number);
+    // Open the log file
+    std::string fname =
+        LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", wal_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+    auto logFileDropped = [this, &fname]() {
+      uint64_t bytes;
+      if (env_->GetFileSize(fname, &bytes).ok()) {
+        auto info_log = immutable_db_options_.info_log.get();
+        ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+                       static_cast<int>(bytes));
+      }
+    };
+    if (stop_replay_by_wal_filter) {
+      logFileDropped();
+      continue;
+    }
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        MaybeIgnoreError(&status);
+        if (!status.ok()) {
+          return status;
+        } else {
+          // Fail with one log file, but that's ok.
+          // Try next one.
+          continue;
+        }
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size,
+          io_tracer_));
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = immutable_db_options_.info_log.get();
+    reporter.fname = fname.c_str();
+    if (!immutable_db_options_.paranoid_checks ||
+        immutable_db_options_.wal_recovery_mode ==
+            WALRecoveryMode::kSkipAnyCorruptedRecords) {
+      reporter.status = nullptr;
+    } else {
+      reporter.status = &status;
+    }
+    // We intentionally make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+                       &reporter, true /*checksum*/, wal_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+    std::string scratch;
+    Slice record;
+
+    TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
+                             /*arg=*/nullptr);
+    uint64_t record_checksum;
+    while (!stop_replay_by_wal_filter &&
+           reader.ReadRecord(&record, &scratch,
+                             immutable_db_options_.wal_recovery_mode,
+                             &record_checksum) &&
+           status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+
+      // We create a new batch and initialize it with a valid prot_info_ to
+      // store the data checksums
+      WriteBatch batch;
+
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (!status.ok()) {
+        return status;
+      }
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+          &record_checksum);
+      status = WriteBatchInternal::UpdateProtectionInfo(
+          &batch, 8 /* bytes_per_key */, &record_checksum);
+      if (!status.ok()) {
+        return status;
+      }
+
+      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kPointInTimeRecovery) {
+        // In point-in-time recovery mode, if the sequence ids of the log
+        // files are consecutive, we continue recovery despite corruption.
+        // This could happen when we open and write to a corrupted DB, where
+        // the sequence id will start from the last sequence id we recovered.
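+        // Worked example (illustrative, not from the sources): suppose WAL
+        // #5 hit a corrupted record after publishing sequence 100, so
+        // *next_sequence is 101. If the first batch of WAL #6 carries
+        // sequence 101, the ids are consecutive and replay safely resumes.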
+        if (sequence == *next_sequence) {
+          stop_replay_for_corruption = false;
+        }
+        if (stop_replay_for_corruption) {
+          logFileDropped();
+          break;
+        }
+      }
+
+      // For the default case of wal_filter == nullptr, this always performs
+      // a no-op and returns true.
+      if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
+                                              status, stop_replay_by_wal_filter,
+                                              batch)) {
+        continue;
+      }
+
+      // If the column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true
+      bool has_valid_writes = false;
+      status = WriteBatchInternal::InsertInto(
+          &batch, column_family_memtables_.get(), &flush_scheduler_,
+          &trim_history_scheduler_, true, wal_number, this,
+          false /* concurrent_memtable_writes */, next_sequence,
+          &has_valid_writes, seq_per_batch_, batch_per_txn_);
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        // We are treating this as a failure while reading, since we read
+        // valid blocks that do not form coherent data
+        reporter.Corruption(record.size(), status);
+        continue;
+      }
+
+      if (has_valid_writes && !read_only) {
+        // we can do this because this is called before the client has access
+        // to the DB and there is only a single thread operating on the DB
+        ColumnFamilyData* cfd;
+
+        while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+          cfd->UnrefAndTryDelete();
+          // If this asserts, it means that InsertInto failed in
+          // filtering updates to already-flushed column families
+          assert(cfd->GetLogNumber() <= wal_number);
+          auto iter = version_edits.find(cfd->GetID());
+          assert(iter != version_edits.end());
+          VersionEdit* edit = &iter->second;
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Reflect errors immediately so that conditions like full
+            // file-systems cause the DB::Open() to fail.
+            return status;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 *next_sequence);
+        }
+      }
+    }
+
+    if (!status.ok()) {
+      if (status.IsNotSupported()) {
+        // We should not treat NotSupported as corruption. It is rather a
+        // clear sign that we are processing a WAL that was produced by an
+        // incompatible version of the code.
+        return status;
+      }
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kSkipAnyCorruptedRecords) {
+        // We should ignore all errors unconditionally
+        status = Status::OK();
+      } else if (immutable_db_options_.wal_recovery_mode ==
+                 WALRecoveryMode::kPointInTimeRecovery) {
+        if (status.IsIOError()) {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "IOError during point-in-time reading log #%" PRIu64
+                          " seq #%" PRIu64
+                          ". %s. This likely means loss of synced WAL, "
+                          "thus recovery fails.",
+                          wal_number, *next_sequence,
+                          status.ToString().c_str());
+          return status;
+        }
+        // We should ignore the error but not continue replaying
+        status = Status::OK();
+        stop_replay_for_corruption = true;
+        corrupted_wal_number = wal_number;
+        if (corrupted_wal_found != nullptr) {
+          *corrupted_wal_found = true;
+        }
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Point in time recovered to log #%" PRIu64
+                       " seq #%" PRIu64,
+                       wal_number, *next_sequence);
+      } else {
+        assert(immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kTolerateCorruptedTailRecords ||
+               immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kAbsoluteConsistency);
+        return status;
+      }
+    }
+
+    flush_scheduler_.Clear();
+    trim_history_scheduler_.Clear();
+    auto last_sequence = *next_sequence - 1;
+    if ((*next_sequence != kMaxSequenceNumber) &&
+        (versions_->LastSequence() <= last_sequence)) {
+      versions_->SetLastAllocatedSequence(last_sequence);
+      versions_->SetLastPublishedSequence(last_sequence);
+      versions_->SetLastSequence(last_sequence);
+    }
+  }
+  // Compare the corrupted log number to each column family's current log
+  // number. Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means the CF contains data beyond the
+  // point of corruption. This could happen during PIT recovery when the WAL
+  // is corrupted and some (but not all) CFs are flushed.
+  // Exclude the PIT case where no log is dropped after the corruption point.
+  // This covers the case of empty WALs after the corrupted log, in which we
+  // don't reset stop_replay_for_corruption.
+  if (stop_replay_for_corruption == true &&
+      (immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kPointInTimeRecovery ||
+       immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // One special case causes cfd->GetLogNumber() > corrupted_wal_number
+      // while the CF is still consistent: if a new column family is created
+      // during the flush and the WAL sync fails at the same time, the new CF
+      // points to the new WAL but the old WAL is corrupted. Since the new CF
+      // is empty, it is still consistent. We add the check of the CF sst file
+      // size to avoid a false positive alert.
+
+      // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
+      // ignoring a very rare inconsistency case caused by data cancellation:
+      // one CF is empty due to KV deletion, but those operations are in the
+      // WAL. If the WAL is corrupted, the status of this CF might not be
+      // consistent with the others. However, the consistency check will be
+      // bypassed due to the empty CF.
+      // TODO: a better and complete implementation is needed to ensure a
+      // strict consistency check in WAL recovery, including handling the
+      // tailing issues.
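+      // Illustrative scenario (not from the sources): WAL #4 is corrupted,
+      // so corrupted_wal_number == 4. A CF that was flushed up to WAL #6 has
+      // GetLogNumber() == 6 > 4 and live SST data, i.e. its SSTs contain
+      // updates that replay could not verify, so Open() must fail below.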
+      if (cfd->GetLogNumber() > corrupted_wal_number &&
+          cfd->GetLiveSstFilesSize() > 0) {
+        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                        "Column family inconsistency: SST file contains data"
+                        " beyond the point of corruption.");
+        return Status::Corruption("SST file is ahead of WALs in CF " +
+                                  cfd->GetName());
+      }
+    }
+  }
+
+  // True if there's any data in the WALs; if not, we can skip re-processing
+  // them later
+  bool data_seen = false;
+  if (!read_only) {
+    // no need to refcount since the client still doesn't have access
+    // to the DB and can not drop column families while we iterate
+    const WalNumber max_wal_number = wal_numbers.back();
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto iter = version_edits.find(cfd->GetID());
+      assert(iter != version_edits.end());
+      VersionEdit* edit = &iter->second;
+
+      if (cfd->GetLogNumber() > max_wal_number) {
+        // Column family cfd has already flushed the data
+        // from all wals. The memtable has to be empty because
+        // we filter the updates based on wal_number
+        // (in WriteBatch::InsertInto)
+        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+        assert(edit->NumEntries() == 0);
+        continue;
+      }
+
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+      // flush the final memtable (if non-empty)
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        // If a flush happened in the middle of recovery (e.g. due to the
+        // memtable being full), we flush at the end. Otherwise we'd need to
+        // record where we were on the last flush, which makes the logic
+        // complicated.
+        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Recovery failed
+            break;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 versions_->LastSequence());
+        }
+        data_seen = true;
+      }
+
+      // Update the log number info in the version edit corresponding to this
+      // column family. Note that the version edits will be written to
+      // MANIFEST together later.
+      // Writing wal_number in the manifest means that any log file
+      // with a number strictly less than (wal_number + 1) is already
+      // recovered and should be ignored on the next reincarnation.
+      // Since we already recovered max_wal_number, we want all wals
+      // with numbers `<= max_wal_number` (including this one) to be ignored
+      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+        edit->SetLogNumber(max_wal_number + 1);
+      }
+    }
+    if (status.ok()) {
+      // we must mark the next log number as used, even though it's
+      // not actually used. That is because VersionSet assumes
+      // VersionSet::next_file_number_ is always strictly greater than any
+      // log number
+      versions_->MarkFileNumberUsed(max_wal_number + 1);
+      assert(recovery_ctx != nullptr);
+
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        auto iter = version_edits.find(cfd->GetID());
+        assert(iter != version_edits.end());
+        recovery_ctx->UpdateVersionEdits(cfd, iter->second);
+      }
+
+      if (flushed) {
+        VersionEdit wal_deletion;
+        if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+          wal_deletion.DeleteWalsBefore(max_wal_number + 1);
+        }
+        if (!allow_2pc()) {
+          // In non-2pc mode, flushing the memtables of the column families
+          // means we can advance min_log_number_to_keep.
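+          // Example (illustrative): if wal_numbers is {3, 5, 7}, then
+          // max_wal_number is 7, and the line below records
+          // min_log_number_to_keep = 8, so every recovered WAL becomes
+          // obsolete once this version edit is persisted.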
+          wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
+        }
+        assert(versions_->GetColumnFamilySet() != nullptr);
+        recovery_ctx->UpdateVersionEdits(
+            versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
+      }
+    }
+  }
+
+  if (status.ok()) {
+    if (data_seen && !flushed) {
+      status = RestoreAliveLogFiles(wal_numbers);
+    } else if (!wal_numbers.empty()) {  // If there's no data in the WAL, or
+                                        // we flushed all the data, still
+      // truncate the log file. If the process goes into a crash loop before
+      // the file is deleted, the preallocated space will never get freed.
+      const bool truncate = !read_only;
+      GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
+          .PermitUncheckedError();
+    }
+  }
+
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
+
+  return status;
+}
+
+Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+                                          LogFileNumberSize* log_ptr) {
+  LogFileNumberSize log(wal_number);
+  std::string fname =
+      LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+  Status s;
+  // This gets the apparent size of the wals, not including preallocated
+  // space.
+  s = env_->GetFileSize(fname, &log.size);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
+  if (s.ok() && truncate) {
+    std::unique_ptr<FSWritableFile> last_log;
+    Status truncate_status = fs_->ReopenWritableFile(
+        fname,
+        fs_->OptimizeForLogWrite(
+            file_options_,
+            BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+        &last_log, nullptr);
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+    }
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Close(IOOptions(), nullptr);
+    }
+    // Not a critical error if we fail to truncate.
+    if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to truncate log #%" PRIu64 ": %s", wal_number,
+                     truncate_status.ToString().c_str());
+    }
+  }
+  if (log_ptr) {
+    *log_ptr = log;
+  }
+  return s;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
+  if (wal_numbers.empty()) {
+    return Status::OK();
+  }
+  Status s;
+  mutex_.AssertHeld();
+  assert(immutable_db_options_.avoid_flush_during_recovery);
+  // Mark these as alive so they'll be considered for deletion later by
+  // FindObsoleteFiles()
+  total_log_size_ = 0;
+  log_empty_ = false;
+  uint64_t min_wal_with_unflushed_data =
+      versions_->MinLogNumberWithUnflushedData();
+  for (auto wal_number : wal_numbers) {
+    if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+      // In non-2pc mode, the WAL files that do not back unflushed data are
+      // not alive, and thus should not be added to alive_log_files_.
+      continue;
+    }
+    // We preallocate space for wals, but after a crash and restart that
+    // preallocated space is no longer needed. It is likely that only the
+    // last log has such preallocated space, so we only truncate the last
+    // log.
+    LogFileNumberSize log;
+    s = GetLogSizeAndMaybeTruncate(
+        wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
+    if (!s.ok()) {
+      break;
+    }
+    total_log_size_ += log.size;
+    alive_log_files_.push_back(log);
+  }
+  return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                           MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  // The immutable memtable list must be empty.
+  assert(std::numeric_limits<uint64_t>::max() ==
+         cfd->imm()->GetEarliestMemTableID());
+
+  const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
+
+  FileMetaData meta;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  Arena arena;
+  Status s;
+  TableProperties table_properties;
+  {
+    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] [WriteLevel0TableForRecovery]"
+                    " Level-0 table #%" PRIu64 ": started",
+                    cfd->GetName().c_str(), meta.fd.GetNumber());
+
+    // Get the latest mutable cf options while the mutex is still locked
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    bool paranoid_file_checks =
+        cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+    int64_t _current_time = 0;
+    immutable_db_options_.clock->GetCurrentTime(&_current_time)
+        .PermitUncheckedError();  // ignore error
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    meta.oldest_ancester_time = current_time;
+
+    {
+      auto write_hint = cfd->CalculateSSTWriteHint(0);
+      mutex_.Unlock();
+
+      SequenceNumber earliest_write_conflict_snapshot;
+      std::vector<SequenceNumber> snapshot_seqs =
+          snapshots_.GetAll(&earliest_write_conflict_snapshot);
+      auto snapshot_checker = snapshot_checker_.get();
+      if (use_custom_gc_ && snapshot_checker == nullptr) {
+        snapshot_checker = DisableGCSnapshotChecker::Instance();
+      }
+      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+          range_del_iters;
+      auto range_del_iter =
+          // This is called during recovery, where a live memtable is flushed
+          // directly. In this case, no fragmented tombstone list is cached in
+          // this memtable yet.
+          mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
+                                         false /* immutable_memtable */);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+
+      IOStatus io_s;
+      TableBuilderOptions tboptions(
+          *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(),
+          cfd->int_tbl_prop_collector_factories(),
+          GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+          mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
+          0 /* level */, false /* is_bottommost */,
+          TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+          0 /* file_creation_time */, db_id_, db_session_id_,
+          0 /* target_file_size */, meta.fd.GetNumber());
+      SeqnoToTimeMapping empty_seqno_time_mapping;
+      s = BuildTable(
+          dbname_, versions_.get(), immutable_db_options_, tboptions,
+          file_options_for_compaction_, cfd->table_cache(), iter.get(),
+          std::move(range_del_iters), &meta, &blob_file_additions,
+          snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber,
+          snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s,
+          io_tracer_, BlobFileCreationReason::kRecovery,
+          empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
+          nullptr /* table_properties */, write_hint,
+          nullptr /*full_history_ts_low*/, &blob_callback_);
+      LogFlush(immutable_db_options_.info_log);
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] [WriteLevel0TableForRecovery]"
+                      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+                      cfd->GetName().c_str(), meta.fd.GetNumber(),
+                      meta.fd.GetFileSize(), s.ToString().c_str());
+      mutex_.Lock();
+
+      // TODO(AR) is this ok?
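+      // BuildTable reports I/O failures through io_s separately from the
+      // logical status s; if table building succeeded but the write itself
+      // failed, surface the I/O error as the overall status.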
+      if (!io_s.ok() && s.ok()) {
+        s = io_s;
+      }
+    }
+  }
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  const bool has_output = meta.fd.GetFileSize() > 0;
+
+  constexpr int level = 0;
+
+  if (s.ok() && has_output) {
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.fd.smallest_seqno, meta.fd.largest_seqno,
+                  meta.marked_for_compaction, meta.temperature,
+                  meta.oldest_blob_file_number, meta.oldest_ancester_time,
+                  meta.file_creation_time, meta.file_checksum,
+                  meta.file_checksum_func_name, meta.unique_id);
+
+    for (const auto& blob : blob_file_additions) {
+      edit->AddBlobFile(blob);
+    }
+  }
+
+  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+  stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+
+  if (has_output) {
+    stats.bytes_written = meta.fd.GetFileSize();
+    stats.num_output_files = 1;
+  }
+
+  const auto& blobs = edit->GetBlobFileAdditions();
+  for (const auto& blob : blobs) {
+    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+  }
+
+  stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+  cfd->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED,
+      stats.bytes_written + stats.bytes_written_blob);
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+  return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  if (db_options.persist_stats_to_disk) {
+    column_families.push_back(
+        ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    if (db_options.persist_stats_to_disk) {
+      assert(handles.size() == 2);
+    } else {
+      assert(handles.size() == 1);
+    }
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family
+    if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+      delete handles[1];
+    }
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  const bool kSeqPerBatch = true;
+  const bool kBatchPerTxn = true;
+  return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+                      !kSeqPerBatch, kBatchPerTxn);
+}
+
+// TODO: Implement the trimming in flush code path.
+// TODO: Perform trimming before inserting into memtable during recovery.
+// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta
+// info, and handle only these files to reduce io.
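+// Usage sketch (illustrative, not part of the upstream sources): a caller
+// opens a timestamp-aware DB and trims every update newer than a chosen
+// timestamp.
+//
+//   DBOptions db_options;
+//   std::vector<ColumnFamilyDescriptor> cfs;   // CFs with ts-aware comparators
+//   std::vector<ColumnFamilyHandle*> handles;
+//   DB* db = nullptr;
+//   std::string trim_ts = /* timestamp encoded per the comparator */;
+//   Status s = DB::OpenAndTrimHistory(db_options, "/path/to/db", cfs,
+//                                     &handles, &db, trim_ts);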
+Status DB::OpenAndTrimHistory(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    std::string trim_ts) {
+  assert(dbptr != nullptr);
+  assert(handles != nullptr);
+  auto validate_options = [&db_options] {
+    if (db_options.avoid_flush_during_recovery) {
+      return Status::InvalidArgument(
+          "avoid_flush_during_recovery incompatible with "
+          "OpenAndTrimHistory");
+    }
+    return Status::OK();
+  };
+  auto s = validate_options();
+  if (!s.ok()) {
+    return s;
+  }
+
+  DB* db = nullptr;
+  s = DB::Open(db_options, dbname, column_families, handles, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(db);
+  CompactRangeOptions options;
+  options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  auto db_impl = static_cast_with_check<DBImpl>(db);
+  for (auto handle : *handles) {
+    assert(handle != nullptr);
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+    auto cfd = cfh->cfd();
+    assert(cfd != nullptr);
+    // Only compact column families with timestamp enabled
+    if (cfd->user_comparator() != nullptr &&
+        cfd->user_comparator()->timestamp_size() > 0) {
+      s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr,
+                                        trim_ts);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  auto clean_op = [&handles, &db] {
+    for (auto handle : *handles) {
+      auto temp_s = db->DestroyColumnFamilyHandle(handle);
+      assert(temp_s.ok());
+    }
+    handles->clear();
+    delete db;
+  };
+  if (!s.ok()) {
+    clean_op();
+    return s;
+  }
+
+  *dbptr = db;
+  return s;
+}
+
+IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+                           size_t preallocate_block_size,
+                           log::Writer** new_log) {
+  IOStatus io_s;
+  std::unique_ptr<FSWritableFile> lfile;
+
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  FileOptions opt_file_options =
+      fs_->OptimizeForLogWrite(file_options_, db_options);
+  std::string wal_dir = immutable_db_options_.GetWalDir();
+  std::string log_fname = LogFileName(wal_dir, log_file_num);
+
+  if (recycle_log_number) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "reusing log %" PRIu64 " from recycle list\n",
+                   recycle_log_number);
+    std::string old_log_fname = LogFileName(wal_dir, recycle_log_number);
+    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+    io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+                                  &lfile, /*dbg=*/nullptr);
+  } else {
+    io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+  }
+
+  if (io_s.ok()) {
+    lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+    lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+    const auto& listeners = immutable_db_options_.listeners;
+    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(lfile), log_fname, opt_file_options,
+        immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners,
+        nullptr, tmp_set.Contains(FileType::kWalFile),
+        tmp_set.Contains(FileType::kWalFile)));
+    *new_log = new log::Writer(std::move(file_writer), log_file_num,
+                               immutable_db_options_.recycle_log_file_num > 0,
+                               immutable_db_options_.manual_wal_flush,
+                               immutable_db_options_.wal_compression);
+    io_s = (*new_log)->AddCompressionTypeRecord();
+  }
+  return io_s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                    const bool seq_per_batch, const bool batch_per_txn) {
+  Status s = ValidateOptionsByTable(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = ValidateOptions(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *dbptr = nullptr;
+  assert(handles);
+  handles->clear();
+
+  size_t max_write_buffer_size = 0;
+  for (auto cf : column_families) {
+    max_write_buffer_size =
+        std::max(max_write_buffer_size, cf.options.write_buffer_size);
+  }
+
+  DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+  if (!impl->immutable_db_options_.info_log) {
+    s = impl->init_logger_creation_s_;
+    delete impl;
+    return s;
+  } else {
+    assert(impl->init_logger_creation_s_.ok());
+  }
+  s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
+  if (s.ok()) {
+    std::vector<std::string> paths;
+    for (auto& db_path : impl->immutable_db_options_.db_paths) {
+      paths.emplace_back(db_path.path);
+    }
+    for (auto& cf : column_families) {
+      for (auto& cf_path : cf.options.cf_paths) {
+        paths.emplace_back(cf_path.path);
+      }
+    }
+    for (auto& path : paths) {
+      s = impl->env_->CreateDirIfMissing(path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+
+    // For recovery from a NoSpace() error, we can only handle
+    // the case where the database is stored in a single path
+    if (paths.size() <= 1) {
+      impl->error_handler_.EnableAutoRecovery();
+    }
+  }
+  if (s.ok()) {
+    s = impl->CreateArchivalDirectory();
+  }
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+
+  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+  RecoveryContext recovery_ctx;
+  impl->mutex_.Lock();
+
+  // Handles create_if_missing, error_if_exists
+  uint64_t recovered_seq(kMaxSequenceNumber);
+  s = impl->Recover(column_families, false, false, false, &recovered_seq,
+                    &recovery_ctx);
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    log::Writer* new_log = nullptr;
+    const size_t preallocate_block_size =
+        impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+    s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+                        preallocate_block_size, &new_log);
+    if (s.ok()) {
+      InstrumentedMutexLock wl(&impl->log_write_mutex_);
+      impl->logfile_number_ = new_log_number;
+      assert(new_log != nullptr);
+      assert(impl->logs_.empty());
+      impl->logs_.emplace_back(new_log_number, new_log);
+    }
+
+    if (s.ok()) {
+      impl->alive_log_files_.push_back(
+          DBImpl::LogFileNumberSize(impl->logfile_number_));
+      // In WritePrepared there could be a gap in sequence numbers. This
+      // breaks the trick we use in kPointInTimeRecovery, which assumes the
+      // first seq in the log right after the corrupted log is one larger
+      // than the last seq we read from the wals. To let this trick keep
+      // working, we add a dummy entry with the expected sequence to the
+      // first log right after recovery.
+      // In the non-WritePrepared case, the new log after recovery could also
+      // be empty, and thus miss the consecutive seq hint needed to
+      // distinguish middle-of-log corruption from a corrupted log that
+      // legitimately remained after recovery. This case will also be
+      // addressed by the dummy write.
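+      // Illustration (a sketch, not from the sources): if recovery stopped
+      // at a corrupted record and the next expected sequence is 250,
+      // recovered_seq is 250, and the empty batch written below stamps
+      // seq 250 into the fresh WAL, preserving the consecutive-sequence
+      // hint that point-in-time recovery relies on next time.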
+      if (recovered_seq != kMaxSequenceNumber) {
+        WriteBatch empty_batch;
+        WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+        WriteOptions write_options;
+        uint64_t log_used, log_size;
+        log::Writer* log_writer = impl->logs_.back().writer;
+        LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+
+        assert(log_writer->get_log_number() == log_file_number_size.number);
+        impl->mutex_.AssertHeld();
+        s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+                             Env::IO_TOTAL, log_file_number_size);
+        if (s.ok()) {
+          // Need to fsync, otherwise it might get lost after a power reset.
+          s = impl->FlushWAL(false);
+          TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
+          if (s.ok()) {
+            s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+          }
+        }
+      }
+    }
+  }
+  if (s.ok()) {
+    s = impl->LogAndApplyForRecovery(recovery_ctx);
+  }
+
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    impl->mutex_.AssertHeld();
+    s = impl->InitPersistStatsColumnFamily();
+  }
+
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd != nullptr) {
+        handles->push_back(
+            new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+        impl->NewThreadStatusCfInfo(cfd);
+      } else {
+        if (db_options.create_missing_column_families) {
+          // missing column family, create it
+          ColumnFamilyHandle* handle = nullptr;
+          impl->mutex_.Unlock();
+          s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+          impl->mutex_.Lock();
+          if (s.ok()) {
+            handles->push_back(handle);
+          } else {
+            break;
+          }
+        } else {
+          s = Status::InvalidArgument("Column family not found", cf.name);
+          break;
+        }
+      }
+    }
+  }
+
+  if (s.ok()) {
+    SuperVersionContext sv_context(/* create_superversion */ true);
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      impl->InstallSuperVersionAndScheduleWork(
+          cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+    }
+    sv_context.Clean();
+  }
+
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // try to read the format version
+    s = impl->PersistentStatsProcessFormatVersion();
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        impl->is_snapshot_supported_ = false;
+      }
+      if (cfd->ioptions()->merge_operator != nullptr &&
+          !cfd->mem()->IsMergeOperatorSupported()) {
+        s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge "
+            "operator, yet its options.merge_operator is non-null",
+            cfd->GetName().c_str());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::Open:Opened");
+  Status persist_options_status;
+  if (s.ok()) {
+    // Persist RocksDB Options before scheduling the compaction.
+    // The WriteOptionsFile() will release and lock the mutex internally.
+    persist_options_status = impl->WriteOptionsFile(
+        false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+    *dbptr = impl;
+    impl->opened_successfully_ = true;
+    impl->DeleteObsoleteFiles();
+    TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
+    impl->MaybeScheduleFlushOrCompaction();
+  } else {
+    persist_options_status.PermitUncheckedError();
+  }
+  impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      impl->immutable_db_options_.sst_file_manager.get());
+  if (s.ok() && sfm) {
+    // Set the Statistics ptr for SstFileManager to dump the stats of
+    // the DeleteScheduler.
+    sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
+    ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
+                   "SstFileManager instance %p", sfm);
+
+    // Notify SstFileManager about all sst files that already exist in
+    // db_paths[0] and cf_paths[0] when the DB is opened.
+
+    // SstFileManagerImpl needs to know the sizes of the files. For files
+    // whose size we already know (sst files that appear in the manifest -
+    // typically the vast majority of all files), we'll pass the size to
+    // SstFileManager. For all other files SstFileManager will query the
+    // size from the filesystem.
+
+    std::vector<ColumnFamilyMetaData> metadata;
+    impl->GetAllColumnFamilyMetaData(&metadata);
+
+    std::unordered_map<std::string, uint64_t> known_file_sizes;
+    for (const auto& md : metadata) {
+      for (const auto& lmd : md.levels) {
+        for (const auto& fmd : lmd.files) {
+          known_file_sizes[fmd.relative_filename] = fmd.size;
+        }
+      }
+      for (const auto& bmd : md.blob_files) {
+        std::string name = bmd.blob_file_name;
+        // The BlobMetaData.blob_file_name may start with "/".
+        if (!name.empty() && name[0] == '/') {
+          name = name.substr(1);
+        }
+        known_file_sizes[name] = bmd.blob_file_size;
+      }
+    }
+
+    std::vector<std::string> paths;
+    paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+    for (auto& cf : column_families) {
+      if (!cf.options.cf_paths.empty()) {
+        paths.emplace_back(cf.options.cf_paths[0].path);
+      }
+    }
+    // Remove duplicate paths.
+    std::sort(paths.begin(), paths.end());
+    paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      std::vector<std::string> existing_files;
+      impl->immutable_db_options_.fs
+          ->GetChildren(path, io_opts, &existing_files,
+                        /*IODebugContext*=*/nullptr)
+          .PermitUncheckedError();  // TODO: What to do on error?
+      for (auto& file_name : existing_files) {
+        uint64_t file_number;
+        FileType file_type;
+        std::string file_path = path + "/" + file_name;
+        if (ParseFileName(file_name, &file_number, &file_type) &&
+            (file_type == kTableFile || file_type == kBlobFile)) {
+          // TODO: Check for errors from OnAddFile?
+          if (known_file_sizes.count(file_name)) {
+            // We're assuming that each sst file name exists in at most one
+            // of the paths.
+            sfm->OnAddFile(file_path, known_file_sizes.at(file_name))
+                .PermitUncheckedError();
+          } else {
+            sfm->OnAddFile(file_path).PermitUncheckedError();
+          }
+        }
+      }
+    }
+
+    // Reserve some disk buffer space. This is a heuristic - when we run out
+    // of disk space, this ensures that there is at least write_buffer_size
+    // amount of free space before we resume DB writes. In low disk space
+    // conditions, we want to avoid a lot of small L0 files due to frequent
+    // WAL write failures and the resultant forced flushes
+    sfm->ReserveDiskBuffer(max_write_buffer_size,
+                           impl->immutable_db_options_.db_paths[0].path);
+  }
+
+#endif  // !ROCKSDB_LITE
+
+  if (s.ok()) {
+    ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+                     impl);
+    LogFlush(impl->immutable_db_options_.info_log);
+    if (!impl->WALBufferIsEmpty()) {
+      s = impl->FlushWAL(false);
+      if (s.ok()) {
+        // Sync is needed, otherwise WAL buffered data might get lost after a
+        // power reset.
+        log::Writer* log_writer = impl->logs_.back().writer;
+        s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+      }
+    }
+    if (s.ok() && !persist_options_status.ok()) {
+      s = Status::IOError(
+          "DB::Open() failed --- Unable to persist Options file",
+          persist_options_status.ToString());
+    }
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
+                   "DB::Open() failed: %s", s.ToString().c_str());
+  }
+  if (s.ok()) {
+    s = impl->StartPeriodicTaskScheduler();
+  }
+
+  if (s.ok()) {
+    s = impl->RegisterRecordSeqnoTimeWorker();
+  }
+  if (!s.ok()) {
+    for (auto* h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+    *dbptr = nullptr;
+  }
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..0f10baf24
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+                               const std::string& dbname)
+    : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
+             /*batch_per_txn*/ true, /*read_only*/ true) {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Opening the db in read only mode");
+  LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* pinnable_val) {
+  return Get(read_options, column_family, key, pinnable_val,
+             /*timestamp*/ nullptr);
+}
+
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* pinnable_val,
+                           std::string* timestamp) {
+  assert(pinnable_val != nullptr);
+  // TODO: is a stopwatch for DB_GET needed? a perf timer?
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between a tombstone and a key that has never been written
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  const Comparator* ucmp = column_family->GetComparator();
+  assert(ucmp);
+  std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+
+  Status s;
+  SequenceNumber snapshot = versions_->LastSequence();
+  GetWithTimestampReadCallback read_cb(snapshot);
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      tracer_->Get(column_family, key);
+    }
+  }
+  SuperVersion* super_version = cfd->GetSuperVersion();
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+  if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+                              /*columns=*/nullptr, ts, &s, &merge_context,
+                              &max_covering_tombstone_seq, read_options,
+                              false /* immutable_memtable */, &read_cb)) {
+    pinnable_val->PinSelf();
+    RecordTick(stats_, MEMTABLE_HIT);
+  } else {
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    PinnedIteratorsManager pinned_iters_mgr;
+    super_version->current->Get(
+        read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+        &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+        /*value_found*/ nullptr,
+        /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
+        /*is_blob*/ nullptr,
+        /*do_merge*/ true);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+  RecordTick(stats_, NUMBER_KEYS_READ);
+  size_t size = pinnable_val->size();
+  RecordTick(stats_, BYTES_READ, size);
+  RecordInHistogram(stats_, BYTES_PER_READ, size);
+  PERF_COUNTER_ADD(get_read_bytes, size);
+  return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+                                      ColumnFamilyHandle* column_family) {
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  SequenceNumber read_seq =
+      read_options.snapshot != nullptr
+          ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+                ->number_
+          : latest_snapshot;
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  auto db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+      super_version->current, read_seq,
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+      super_version->version_number, read_callback);
+  auto internal_iter = NewInternalIterator(
+      db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+      read_seq, /* allow_unprepared_value */ true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+  return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.timestamp) {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+                                          /*ts_for_read=*/true);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfCfHasTs(cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  SequenceNumber read_seq =
+      read_options.snapshot != nullptr
+          ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+                ->number_
+          : latest_snapshot;
+
+  for (auto cfh : column_families) {
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+    auto* sv = cfd->GetSuperVersion()->Ref();
+    auto* db_iter = NewArenaWrappedDbIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        sv->current, read_seq,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        sv->version_number, read_callback);
+    auto* internal_iter = NewInternalIterator(
+        db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
+        /* allow_unprepared_value */ true, db_iter);
+    db_iter->SetIterUnderDBIter(internal_iter);
+    iterators->push_back(db_iter);
+  }
+
+  return Status::OK();
+}
+
+namespace {
+// Return OK if dbname exists in the file system, or create it if
+// create_if_missing is set
+Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
+                                     const std::string& dbname) {
+  Status s;
+  if (!db_options.create_if_missing) {
+    // Attempt to read the "CURRENT" file
+    const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+    std::string manifest_path;
+    uint64_t manifest_file_number;
+    s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
+                                           &manifest_file_number);
+  } else {
+    // Historic behavior that doesn't necessarily make sense
+    s = db_options.env->CreateDirIfMissing(dbname);
+  }
+  return s;
+}
+}  // namespace
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool /*error_if_wal_file_exists*/) {
+  Status s = OpenForReadOnlyCheckExistence(options, dbname);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *dbptr = nullptr;
+
+  // Try to first open the DB as a fully compacted DB
+  s = CompactedDBImpl::Open(options, dbname, dbptr);
+  if (s.ok()) {
+    return s;
+  }
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+      db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl is always holding a
+    // reference to the default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_wal_file_exists) {
+  // If dbname does not exist in the file system, we should not do anything
+  Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+      db_options, dbname, column_families, handles, dbptr,
+      error_if_wal_file_exists);
+}
+
+Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_wal_file_exists) {
+  *dbptr = nullptr;
+  handles->clear();
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+  impl->mutex_.Lock();
+  Status s = impl->Recover(column_families, true /* read only */,
+                           error_if_wal_file_exists);
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd == nullptr) {
+        s = Status::InvalidArgument("Column family not found", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      sv_context.NewSuperVersion();
+      cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  sv_context.Clean();
+  if (s.ok()) {
+    *dbptr = impl;
+    for (auto* h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+    }
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+#else  // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+                           const std::string& /*dbname*/, DB** /*dbptr*/,
+                           bool /*error_if_wal_file_exists*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+    bool /*error_if_wal_file_exists*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..b876a0fda
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
+class DBImplReadOnly : public DBImpl {
+ public:
+  DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  DBImplReadOnly(const DBImplReadOnly&) = delete;
+  void operator=(const DBImplReadOnly&) = delete;
+
+  virtual ~DBImplReadOnly();
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  // TODO: Implement ReadOnly MultiGet?
+ + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::PutEntity; + Status PutEntity(const WriteOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, + const WideColumns& /* columns */) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); + } + + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return 
Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + // FIXME: some missing overrides for more "write" functions + + protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + + private: + // A "helper" function for DB::OpenForReadOnly without column families + // to reduce unnecessary I/O + // It has the same functionality as DB::OpenForReadOnly with column families + // but does not check the existence of dbname in the file system + static Status OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists = false); + friend class DB; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc new file mode 100644 index 000000000..5189d17d9 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc @@ -0,0 +1,967 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl/db_impl_secondary.h" + +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/merge_context.h" +#include "logging/auto_roll_logger.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_wal_file_exists*/, + bool /*error_if_data_exists_in_wals*/, uint64_t*, + RecoveryContext* /*recovery_ctx*/) { + mutex_.AssertHeld(); + + JobContext job_context(0); + Status s; + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + if (manifest_reader_status_) { + manifest_reader_status_->PermitUncheckedError(); + } + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. 
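+  // Each column family contributes write_buffer_size *
+  // max_write_buffer_number bytes to this estimate.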
+ max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + // TODO: update options_file_number_ needed? + + job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); + } + return s; +} + +// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(), + io_opts, &filenames, + /*IODebugContext*=*/nullptr); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.GetWalDir()); + } else if (!s.ok()) { + return s; + } + + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (!log_readers_.empty()) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } + return s; +} + +Status DBImplSecondary::MaybeInitLogReader( + uint64_t log_number, log::FragmentBufferedReader** log_reader) { + auto iter = log_readers_.find(log_number); + // make sure the log file is still present + if (iter == log_readers_.end() || + iter->second->reader_->GetLogNumber() != log_number) { + // delete the obsolete log reader if log number mismatch + if (iter != log_readers_.end()) { + log_readers_.erase(iter); + } + // initialize log reader from log_number + // TODO: min_log_number_to_keep_2pc check needed? 
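+    // Readers are cached in log_readers_ so that a later catch-up pass can
+    // resume tailing the same WAL where the previous replay stopped instead
+    // of re-opening and re-scanning the file.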
+    // Open the log file
+    std::string fname =
+        LogFileName(immutable_db_options_.GetWalDir(), log_number);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", log_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      Status status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        *log_reader = nullptr;
+        return status;
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size,
+          io_tracer_));
+    }
+
+    // Create the log reader.
+    LogReaderContainer* log_reader_container = new LogReaderContainer(
+        env_, immutable_db_options_.info_log, std::move(fname),
+        std::move(file_reader), log_number);
+    log_readers_.insert(std::make_pair(
+        log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+  }
+  iter = log_readers_.find(log_number);
+  assert(iter != log_readers_.end());
+  *log_reader = iter->second->reader_;
+  return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+    const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  mutex_.AssertHeld();
+  Status status;
+  for (auto log_number : log_numbers) {
+    log::FragmentBufferedReader* reader = nullptr;
+    status = MaybeInitLogReader(log_number, &reader);
+    if (!status.ok()) {
+      return status;
+    }
+    assert(reader != nullptr);
+  }
+  for (auto log_number : log_numbers) {
+    auto it = log_readers_.find(log_number);
+    assert(it != log_readers_.end());
+    log::FragmentBufferedReader* reader = it->second->reader_;
+    Status* wal_read_status = it->second->status_;
+    assert(wal_read_status);
+    // Manually update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(log_number);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (reader->ReadRecord(&record, &scratch,
+                              immutable_db_options_.wal_recovery_mode) &&
+           wal_read_status->ok() && status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reader->GetReporter()->Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (!status.ok()) {
+        break;
+      }
+      SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+      std::vector<uint32_t> column_family_ids;
+      status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          if (cfds_changed->count(cfd) == 0) {
+            cfds_changed->insert(cfd);
+          }
+          const std::vector<FileMetaData*>& l0_files =
+              cfd->current()->storage_info()->LevelFiles(0);
+          SequenceNumber seq =
+              l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is smaller than the largest
+          // sequence number persisted for this column family, then its data
+          // must reside in an SST that has already been added in the prior
+          // MANIFEST replay.
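+          // Skipping such batches avoids applying the same updates twice,
+          // once from the SST files and once from the WAL.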
+ if (seq_of_batch <= seq) { + continue; + } + auto curr_log_num = std::numeric_limits::max(); + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. + if (!cfd->mem()->IsEmpty() && + (curr_log_num == std::numeric_limits::max() || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); + cfd->mem()->SetNextLogNumber(log_number); + cfd->mem()->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/, + true, log_number, this, false /* concurrent_memtable_writes */, + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + // passing null flush_scheduler will disable memtable flushing which is + // needed for secondary instances + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } else { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reader->GetReporter()->Corruption(record.size(), status); + } + } + if (status.ok() && !wal_read_status->ok()) { + status = *wal_read_status; + } + if (!status.ok()) { + return status; + } + } + // remove logreaders from map after successfully recovering the WAL + if (log_readers_.size() > 1) { + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); + } + return status; +} + +// Implementation of the DB interface +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value, + /*timestamp*/ nullptr); +} + +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { + return GetImpl(read_options, column_family, key, value, timestamp); +} + +Status DBImplSecondary::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* 
column_family, + const Slice& key, PinnableSlice* pinnable_val, + std::string* timestamp) { + assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + } + + // Clear the timestamp for returning results so that we can distinguish + // between tombstone or key that has never been written later. + if (timestamp) { + timestamp->clear(); + } + + auto cfh = static_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + GetWithTimestampReadCallback read_cb(snapshot); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot, read_options.timestamp); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + const Comparator* ucmp = column_family->GetComparator(); + assert(ucmp); + std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*columns=*/nullptr, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, &read_cb)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s, + &merge_context, &max_covering_tombstone_seq, read_options, + &read_cb)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get( + read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, + /*value_found*/ nullptr, + /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr, + /*do_merge*/ true); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), 
/*ts_for_read=*/true); + if (!s.ok()) { + return NewErrorIterator(s); + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return NewErrorIterator(s); + } + } + + Iterator* result = nullptr; + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return NewErrorIterator( + Status::NotSupported("snapshot not supported in secondary mode")); + } else { + SequenceNumber snapshot(kMaxSequenceNumber); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, + SequenceNumber snapshot, ReadCallback* read_callback, + bool expose_blob_index, bool allow_refresh) { + assert(nullptr != cfd); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + assert(snapshot == kMaxSequenceNumber); + snapshot = versions_->LastSequence(); + assert(snapshot != kMaxSequenceNumber); + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + super_version->current, snapshot, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback, this, cfd, + expose_blob_index, read_options.snapshot ? false : allow_refresh); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + snapshot, /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplSecondary::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + + if (read_options.timestamp) { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), + /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + } else { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfCfHasTs(cf); + if (!s.ok()) { + return s; + } + } + } + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { + return Status::NotSupported( + "tailing iterator not supported in secondary mode"); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. 
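+    // Snapshots are not yet supported here: the secondary's view of the
+    // sequence number space advances only when TryCatchUpWithPrimary() is
+    // called, so a snapshot's sequence number may not be covered.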
+    return Status::NotSupported("snapshot not supported in secondary mode");
+  } else {
+    SequenceNumber read_seq(kMaxSequenceNumber);
+    for (auto cfh : column_families) {
+      ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+  mutex_.AssertHeld();
+  Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, returns success, then we
+  // do not need to give a second chance.
+  if (s.ok()) {
+    return s;
+  }
+  // It's possible that DBImpl::CheckConsistency() can fail because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    return Status::OK();
+  }
+
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+
+  std::string corruption_messages;
+  for (const auto& md : metadata) {
+    // md.name has a leading "/".
+    std::string file_path = md.db_path + md.name;
+
+    uint64_t fsize = 0;
+    s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+         s.IsPathNotFound())) {
+      s = Status::OK();
+    }
+    if (!s.ok()) {
+      corruption_messages +=
+          "Can't access " + md.name + ": " + s.ToString() + "\n";
+    }
+  }
+  return corruption_messages.empty() ? Status::OK()
+                                     : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+  assert(versions_.get() != nullptr);
+  assert(manifest_reader_.get() != nullptr);
+  Status s;
+  // read the manifest and apply new changes to the secondary instance
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  JobContext job_context(0, true /*create_superversion*/);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+            ->ReadAndApply(&mutex_, &manifest_reader_,
+                           manifest_reader_status_.get(), &cfds_changed);
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+                   static_cast<uint64_t>(versions_->LastSequence()));
+    for (ColumnFamilyData* cfd : cfds_changed) {
+      if (cfd->IsDropped()) {
+        ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+                        cfd->GetName().c_str());
+        continue;
+      }
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+                      cfd->current()->storage_info()->LevelSummary(&tmp));
+    }
+
+    // list wal_dir to discover new WALs and apply new changes to the secondary
+    // instance
+    if (s.ok()) {
+      s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+    }
+    if (s.IsPathNotFound()) {
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "Secondary tries to read WAL, but WAL file(s) have already "
+          "been purged by primary.");
+      s = Status::OK();
+    }
+    if (s.ok()) {
+      for (auto cfd : cfds_changed) {
+        cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+                                       &job_context.memtables_to_free);
+        auto& sv_context = job_context.superversion_contexts.back();
+        cfd->InstallSuperVersion(&sv_context, &mutex_);
+        sv_context.NewSuperVersion();
+      }
+    }
+  }
+  job_context.Clean();
+
+  // Cleanup unused, obsolete files.
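+  // File discovery below runs under the DB mutex; the deletions themselves
+  // happen after the lock has been released.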
+  JobContext purge_files_job_context(0);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, secondary instance does not own the database files, thus it
+    // is unnecessary for the secondary to force full scan.
+    FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+  }
+  if (purge_files_job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(purge_files_job_context);
+  }
+  purge_files_job_context.Clean();
+  return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+                           const std::string& secondary_path, DB** dbptr) {
+  *dbptr = nullptr;
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenAsSecondary(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::string& secondary_path,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  *dbptr = nullptr;
+
+  DBOptions tmp_opts(db_options);
+  Status s;
+  if (nullptr == tmp_opts.info_log) {
+    s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+    if (!s.ok()) {
+      tmp_opts.info_log = nullptr;
+      return s;
+    }
+  }
+
+  assert(tmp_opts.info_log != nullptr);
+  if (db_options.max_open_files != -1) {
+    std::ostringstream oss;
+    oss << "The primary instance may delete all types of files after they "
+           "become obsolete. The application can coordinate the primary and "
+           "secondary so that primary does not delete/rename files that are "
+           "currently being used by the secondary. Alternatively, a custom "
+           "Env/FS can be provided such that files become inaccessible only "
+           "after all primary and secondaries indicate that they are obsolete "
+           "and deleted. If the above two are not possible, you can open the "
+           "secondary instance with `max_open_files==-1` so that secondary "
+           "will eagerly keep all table files open. Even if a file is deleted, "
+           "its content can still be accessed via a prior open file "
+           "descriptor. This is a hacky workaround for only table files. If "
+           "none of the above is done, then point lookup or "
+           "range scan via the secondary instance can result in IOError: file "
+           "not found. This can be resolved by retrying "
+           "TryCatchUpWithPrimary().";
+    ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str());
+  }
+
+  handles->clear();
+  DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
+  impl->versions_.reset(new ReactiveVersionSet(
+      dbname, &impl->immutable_db_options_, impl->file_options_,
+      impl->table_cache_.get(), impl->write_buffer_manager_,
+      &impl->write_controller_, impl->io_tracer_));
+  impl->column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+
+  impl->mutex_.Lock();
+  s = impl->Recover(column_families, true, false, false);
+  if (s.ok()) {
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (nullptr == cfd) {
+        s = Status::InvalidArgument("Column family not found", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  SuperVersionContext sv_context(true /* create_superversion */);
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      sv_context.NewSuperVersion();
+      cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  sv_context.Clean();
+  if (s.ok()) {
+    *dbptr = impl;
+    for (auto h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+    }
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+Status DBImplSecondary::CompactWithoutInstallation(
+    const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
+    const CompactionServiceInput& input, CompactionServiceResult* result) {
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  InstrumentedMutexLock l(&mutex_);
+  auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+  if (!cfd) {
+    return Status::InvalidArgument("Cannot find column family " +
+                                   cfh->GetName());
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input.input_files) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  auto* version = cfd->current();
+
+  ColumnFamilyMetaData cf_meta;
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+  ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions();
+  VersionStorageInfo* vstorage = version->storage_info();
+
+  // Use comp_options to reuse some CompactFiles functions
+  CompactionOptions comp_options;
+  comp_options.compression = kDisableCompressionOption;
+  comp_options.output_file_size_limit = MaxFileSizeForLevel(
+      *mutable_cf_options, input.output_level, cf_options.compaction_style,
+      vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes);
+
+  std::vector<CompactionInputFiles> input_files;
+  Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage, comp_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      comp_options, input_files, input.output_level, vstorage,
+      *mutable_cf_options, mutable_db_options_, 0));
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+
+  // Create the output directory if it doesn't exist yet
+  std::unique_ptr<FSDirectory> output_dir;
+  s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
+  if (!s.ok()) {
+    return s;
+  }
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  const int job_id = next_job_id_.fetch_add(1);
+
+  // Use the primary host's db_id for running the compaction, but use the
+  // local db_session_id, to make sure the unique id is unique across the
+  // remote compactors. The id is generated from db_id, db_session_id and
+  // orig_file_number; unlike a local compaction, a remote compaction cannot
+  // guarantee the uniqueness of orig_file_number, since the file number is
+  // only assigned when the compaction is done.
+  CompactionServiceCompactionJob compaction_job(
+      job_id, c.get(), immutable_db_options_, mutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
+      input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+      options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
+      input.db_id, db_session_id_, secondary_path_, input, result);
+
+  mutex_.Unlock();
+  s = compaction_job.Run();
+  mutex_.Lock();
+
+  // clean up
+  compaction_job.io_status().PermitUncheckedError();
+  compaction_job.CleanupCompaction();
+  c->ReleaseCompactionFiles(s);
+  c.reset();
+
+  TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
+                           &s);
+  result->status = s;
+  return s;
+}
+
+Status DB::OpenAndCompact(
+    const OpenAndCompactOptions& options, const std::string& name,
+    const std::string& output_directory, const std::string& input,
+    std::string* output,
+    const CompactionServiceOptionsOverride& override_options) {
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  CompactionServiceInput compaction_input;
+  Status s = CompactionServiceInput::Read(input, &compaction_input);
+  if (!s.ok()) {
+    return s;
+  }
+
+  compaction_input.db_options.max_open_files = -1;
+  compaction_input.db_options.compaction_service = nullptr;
+  if (compaction_input.db_options.statistics) {
+    compaction_input.db_options.statistics.reset();
+  }
+  compaction_input.db_options.env = override_options.env;
+  compaction_input.db_options.file_checksum_gen_factory =
+      override_options.file_checksum_gen_factory;
+  compaction_input.db_options.statistics = override_options.statistics;
+  compaction_input.column_family.options.comparator =
+      override_options.comparator;
+  compaction_input.column_family.options.merge_operator =
+      override_options.merge_operator;
+  compaction_input.column_family.options.compaction_filter =
+      override_options.compaction_filter;
+  compaction_input.column_family.options.compaction_filter_factory =
+      override_options.compaction_filter_factory;
+  compaction_input.column_family.options.prefix_extractor =
+      override_options.prefix_extractor;
+  compaction_input.column_family.options.table_factory =
+      override_options.table_factory;
+  compaction_input.column_family.options.sst_partitioner_factory =
+      override_options.sst_partitioner_factory;
+  compaction_input.column_family.options.table_properties_collector_factories =
+      override_options.table_properties_collector_factories;
+  compaction_input.db_options.listeners = override_options.listeners;
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(compaction_input.column_family);
+  // TODO: we have to open the default CF, because of an implementation
+  // limitation; currently we just use the same CF options from the input,
+  // which are not correct and the open may fail.
open may fail. + if (compaction_input.column_family.name != kDefaultColumnFamilyName) { + column_families.emplace_back(kDefaultColumnFamilyName, + compaction_input.column_family.options); + } + + DB* db; + std::vector handles; + + s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, + column_families, &handles, &db); + if (!s.ok()) { + return s; + } + + CompactionServiceResult compaction_result; + DBImplSecondary* db_secondary = static_cast_with_check(db); + assert(handles.size() > 0); + s = db_secondary->CompactWithoutInstallation( + options, handles[0], compaction_input, &compaction_result); + + Status serialization_status = compaction_result.Write(output); + + for (auto& handle : handles) { + delete handle; + } + delete db; + if (s.ok()) { + return serialization_status; + } + return s; +} + +Status DB::OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options) { + return OpenAndCompact(OpenAndCompactOptions(), name, output_directory, input, + output, override_options); +} + +#else // !ROCKSDB_LITE + +Status DB::OpenAsSecondary(const Options& /*options*/, + const std::string& /*name*/, + const std::string& /*secondary_path*/, + DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenAsSecondary( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::string& /*secondary_path*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h new file mode 100644 index 000000000..eb9361875 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.h @@ -0,0 +1,410 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper class to hold log reader, log reporter, log status. +class LogReaderContainer { + public: + LogReaderContainer() + : reader_(nullptr), reporter_(nullptr), status_(nullptr) {} + LogReaderContainer(Env* env, std::shared_ptr info_log, + std::string fname, + std::unique_ptr&& file_reader, + uint64_t log_number) { + LogReporter* reporter = new LogReporter(); + status_ = new Status(); + reporter->env = env; + reporter->info_log = info_log.get(); + reporter->fname = std::move(fname); + reporter->status = status_; + reporter_ = reporter; + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). 
+    reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+                                              reporter, true /*checksum*/,
+                                              log_number);
+  }
+  log::FragmentBufferedReader* reader_;
+  log::Reader::Reporter* reporter_;
+  Status* status_;
+  ~LogReaderContainer() {
+    delete reader_;
+    delete reporter_;
+    delete status_;
+  }
+
+ private:
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    std::string fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (this->status == nullptr ? "(ignoring error) " : ""),
+                     fname.c_str(), static_cast<int>(bytes),
+                     s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) {
+        *this->status = s;
+      }
+    }
+  };
+};
+
+// The secondary instance shares access to the storage with the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
+// effort attempts to catch up with the primary.
+// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
+class DBImplSecondary : public DBImpl {
+ public:
+  DBImplSecondary(const DBOptions& options, const std::string& dbname,
+                  std::string secondary_path);
+  ~DBImplSecondary() override;
+
+  // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+  // and log_readers_ to facilitate future operations.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only, bool error_if_wal_file_exists,
+                 bool error_if_data_exists_in_wals, uint64_t* = nullptr,
+                 RecoveryContext* recovery_ctx = nullptr) override;
+
+  // Implementations of the DB interface.
+  using DB::Get;
+  // Can return IOError due to files being deleted by the primary. To avoid
+  // IOError in this case, the application can coordinate between primary and
+  // secondaries so that the primary will not delete files that are currently
+  // being used by the secondaries. The application can also provide a custom
+  // FS/Env implementation so that files will remain present until all primary
+  // and secondaries indicate that they can be deleted. As a partial hacky
+  // workaround, the secondaries can be opened with `max_open_files=-1` so
+  // that they eagerly keep all table files open and are able to access the
+  // contents of deleted files via a previously opened file descriptor.
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                 const Slice& key, PinnableSlice* value,
+                 std::string* timestamp);
+
+  using DBImpl::NewIterator;
+  // Operations on the created iterators can return IOError due to files being
+  // deleted by the primary. To avoid IOError in this case, the application
+  // can coordinate between primary and secondaries so that the primary will
+  // not delete files that are currently being used by the secondaries. The
+  // application can also provide a custom FS/Env implementation so that files
+  // will remain present until all primary and secondaries indicate that they
+  // can be deleted.
+  // As a partial hacky workaround, the secondaries can be opened with
+  // `max_open_files=-1` so that they eagerly keep all table files open and
+  // are able to access the contents of deleted files via a previously opened
+  // file descriptor.
+  Iterator* NewIterator(const ReadOptions&,
+                        ColumnFamilyHandle* column_family) override;
+
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool expose_blob_index = false,
+                                      bool allow_refresh = true);
+
+  Status NewIterators(const ReadOptions& options,
+                      const std::vector<ColumnFamilyHandle*>& column_families,
+                      std::vector<Iterator*>* iterators) override;
+
+  using DBImpl::Put;
+  Status Put(const WriteOptions& /*options*/,
+             ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+             const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Merge;
+  Status Merge(const WriteOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Delete;
+  Status Delete(const WriteOptions& /*options*/,
+                ColumnFamilyHandle* /*column_family*/,
+                const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SingleDelete;
+  Status SingleDelete(const WriteOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status Write(const WriteOptions& /*options*/,
+               WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactRange;
+  Status CompactRange(const CompactRangeOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice* /*begin*/, const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactFiles;
+  Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status GetLiveFiles(std::vector<std::string>&,
+                      uint64_t* /*manifest_file_size*/,
+                      bool /*flush_memtable*/ = true) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Flush;
+  Status Flush(const FlushOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetDBOptions;
+  Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+                      /*options_map*/) override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* /*cfd*/,
+      const std::unordered_map<std::string, std::string>& /*options_map*/)
+      override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction and/or write to MANIFEST.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SyncWAL;
+  Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DB::IngestExternalFile;
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+  Status TryCatchUpWithPrimary() override;
+
+  // Try to find a log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist.
+  Status MaybeInitLogReader(uint64_t log_number,
+                            log::FragmentBufferedReader** log_reader);
+
+  // Check that all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
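+  // A file that is merely missing is tolerated; only other access errors
+  // are reported as corruption.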
+ Status CheckConsistency() override; + +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options, + ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(options, cfh, input, result); + } +#endif // NDEBUG + + protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + + bool OwnTablesAndLogs() const override { + // Currently, the secondary instance does not own the database files. It + // simply opens the files of the primary instance and tracks their file + // descriptors until they become obsolete. In the future, the secondary may + // create links to database files. OwnTablesAndLogs will return true then. 
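+    // Returning false keeps the file purge path from deleting table and log
+    // files that are still owned by the primary.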
+ return false; + } + + private: + friend class DB; + + // No copying allowed + DBImplSecondary(const DBImplSecondary&); + void operator=(const DBImplSecondary&); + + using DBImpl::Recover; + + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); + Status FindNewLogNumbers(std::vector* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, + std::unordered_set* cfds_changed, + JobContext* job_context); + + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(const OpenAndCompactOptions& options, + ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + + std::unique_ptr manifest_reader_; + std::unique_ptr manifest_reporter_; + std::unique_ptr manifest_reader_status_; + + // Cache log readers for each log number, used for continue WAL replay + // after recovery + std::map> log_readers_; + + // Current WAL number replayed for each column family. + std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc new file mode 100644 index 000000000..a597c168d --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_write.cc @@ -0,0 +1,2435 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
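+
+// This file implements the write path of DBImpl: the Put/Merge/Delete/
+// DeleteRange convenience wrappers below validate timestamp options and then
+// funnel into WriteImpl(), the common entry point for all writes.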
+#include + +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "options/options_helper.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Put(o, column_family, key, ts, val); +} + +Status DBImpl::PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + + return DB::PutEntity(options, column_family, key, columns); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + auto cfh = static_cast_with_check(column_family); + if (!cfh->cfd()->ioptions()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Merge(o, column_family, key, ts, val); +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::Delete(write_options, column_family, key); +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Delete(write_options, column_family, key, ts); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::SingleDelete(write_options, column_family, key); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::SingleDelete(write_options, column_family, key, ts); +} + +Status DBImpl::DeleteRange(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::DeleteRange(write_options, column_family, begin_key, end_key); +} + +Status DBImpl::DeleteRange(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const 
Slice& end_key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts); +} + +void DBImpl::SetRecoverableStatePreReleaseCallback( + PreReleaseCallback* callback) { + recoverable_state_pre_release_callback_.reset(callback); +} + +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { + Status s; + if (write_options.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + my_batch, write_options.protection_bytes_per_key); + } + if (s.ok()) { + s = WriteImpl(write_options, my_batch, /*callback=*/nullptr, + /*log_used=*/nullptr); + } + return s; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback) { + Status s; + if (write_options.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + my_batch, write_options.protection_bytes_per_key); + } + if (s.ok()) { + s = WriteImpl(write_options, my_batch, callback, nullptr); + } + return s; +} +#endif // ROCKSDB_LITE + +// The main write queue. This is the only write queue that updates LastSequence. +// When using one write queue, the same sequence also indicates the last +// published sequence. +Status DBImpl::WriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used, + size_t batch_cnt, + PreReleaseCallback* pre_release_callback, + PostMemTableCallback* post_memtable_callback) { + assert(!seq_per_batch_ || batch_cnt != 0); + assert(my_batch == nullptr || my_batch->Count() == 0 || + write_options.protection_bytes_per_key == 0 || + write_options.protection_bytes_per_key == + my_batch->GetProtectionBytesPerKey()); + if (my_batch == nullptr) { + return Status::InvalidArgument("Batch is nullptr!"); + } else if (!disable_memtable && + WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) { + // If writing to memtable, then we require the caller to set/update the + // timestamps for the keys in the write batch. + // Otherwise, it means we are just writing to the WAL, and we allow + // timestamps unset for the keys in the write batch. This can happen if we + // use TransactionDB with write-committed policy, and we currently do not + // support user-defined timestamp with other policies. + // In the prepare phase, a transaction can write the batch to the WAL + // without inserting to memtable. The keys in the batch do not have to be + // assigned timestamps because they will be used only during recovery if + // there is a commit marker which includes their commit timestamp. 
+ return Status::InvalidArgument("write batch must have timestamp(s) set"); + } else if (write_options.rate_limiter_priority != Env::IO_TOTAL && + write_options.rate_limiter_priority != Env::IO_USER) { + return Status::InvalidArgument( + "WriteOptions::rate_limiter_priority only allows " + "Env::IO_TOTAL and Env::IO_USER due to implementation constraints"); + } else if (write_options.rate_limiter_priority != Env::IO_TOTAL && + (write_options.disableWAL || manual_wal_flush_)) { + return Status::InvalidArgument( + "WriteOptions::rate_limiter_priority currently only supports " + "rate-limiting automatic WAL flush, which requires " + "`WriteOptions::disableWAL` and " + "`DBOptions::manual_wal_flush` both set to false"); + } else if (write_options.protection_bytes_per_key != 0 && + write_options.protection_bytes_per_key != 8) { + return Status::InvalidArgument( + "`WriteOptions::protection_bytes_per_key` must be zero or eight"); + } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ && !tracer_->IsWriteOrderPreserved()) { + // We don't have to preserve write order so can trace anywhere. It's more + // efficient to trace here than to add latency to a phase of the log/apply + // pipeline. + // TODO: maybe handle the tracing status? + tracer_->Write(my_batch).PermitUncheckedError(); + } + } + if (write_options.sync && write_options.disableWAL) { + return Status::InvalidArgument("Sync writes has to enable WAL."); + } + if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with concurrent prepares"); + } + if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { + // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt + return Status::NotSupported( + "pipelined_writes is not compatible with seq_per_batch"); + } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } + if (immutable_db_options_.enable_pipelined_write && + post_memtable_callback != nullptr) { + return Status::NotSupported( + "pipelined write currently does not honor post_memtable_callback"); + } + if (seq_per_batch_ && post_memtable_callback != nullptr) { + return Status::NotSupported( + "seq_per_batch currently does not honor post_memtable_callback"); + } + // Otherwise IsLatestPersistentState optimization does not make sense + assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || + disable_memtable); + + if (write_options.low_pri) { + Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!s.ok()) { + return s; + } + } + + if (two_write_queues_ && disable_memtable) { + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt
+        // every key is a sub-batch consuming a seq
+        : WriteBatchInternal::Count(my_batch);
+    uint64_t seq = 0;
+    // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+    Status status = WriteImplWALOnly(
+        &write_thread_, write_options, my_batch, callback, log_used, log_ref,
+        &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
+        kDoPublishLastSeq, disable_memtable);
+    TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+    if (!status.ok()) {
+      return status;
+    }
+    if (seq_used) {
+      *seq_used = seq;
+    }
+    if (!disable_memtable) {
+      TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+      status = UnorderedWriteMemtable(write_options, my_batch, callback,
+                                      log_ref, seq, sub_batch_cnt);
+    }
+    return status;
+  }
+
+  if (immutable_db_options_.enable_pipelined_write) {
+    return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+                              log_ref, disable_memtable, seq_used);
+  }
+
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, batch_cnt, pre_release_callback,
+                        post_memtable_callback);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  write_thread_.JoinBatchGroup(&w);
+  if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+    // we are a non-leader in a parallel group
+
+    if (w.ShouldWriteToMemtable()) {
+      PERF_TIMER_STOP(write_pre_and_post_process_time);
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      ColumnFamilyMemTablesImpl column_family_memtables(
+          versions_->GetColumnFamilySet());
+      w.status = WriteBatchInternal::InsertInto(
+          &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+          &trim_history_scheduler_,
+          write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+          true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+          batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+      PERF_TIMER_START(write_pre_and_post_process_time);
+    }
+
+    if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      // we're responsible for exit batch group
+      // TODO(myabandeh): propagate status to write_group
+      auto last_sequence = w.write_group->last_sequence;
+      for (auto* tmp_w : *(w.write_group)) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      versions_->SetLastSequence(last_sequence);
+      MemTableInsertStatusCheck(w.status);
+      write_thread_.ExitAsBatchGroupFollower(&w);
+    }
+    assert(w.state == WriteThread::STATE_COMPLETED);
+    // STATE_COMPLETED conditional below handles exit
+  }
+  if (w.state == WriteThread::STATE_COMPLETED) {
+    if (log_used != nullptr) {
+      *log_used = w.log_used;
+    }
+    if (seq_used != nullptr) {
+      *seq_used = w.sequence;
+    }
+    // write is complete and leader has updated sequence
+    return w.FinalStatus();
+  }
+  // else we are the leader of the write batch group
+  assert(w.state == WriteThread::STATE_GROUP_LEADER);
+  Status status;
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in
+  // "writers_" when it finds it suitable, and finish them in the same write
+  // batch. This is how one writer's job can end up being done by another
+  // writer.
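+  // Illustrative timeline of one batch group (a sketch, not normative):
+  // writers W1..W3 call JoinBatchGroup concurrently; W1 wakes up as
+  // STATE_GROUP_LEADER and writes W1+W2+W3 to the WAL as a single group.
+  // W2 and W3 either block until the leader marks them STATE_COMPLETED, or,
+  // when allow_concurrent_memtable_write is set, wake up as
+  // STATE_PARALLEL_MEMTABLE_WRITER and apply their own batches to the
+  // memtable in parallel (the non-leader branch above).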
+  WriteContext write_context;
+  LogContext log_context(write_options.sync);
+  WriteThread::WriteGroup write_group;
+  bool in_parallel_group = false;
+  uint64_t last_sequence = kMaxSequenceNumber;
+
+  assert(!two_write_queues_ || !disable_memtable);
+  {
+    // With concurrent writes, we do the preprocessing only in the write
+    // thread that also writes to the memtable, to avoid synchronization
+    // issues on shared data structures with the other threads.
+
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    status = PreprocessWrite(write_options, &log_context, &write_context);
+    if (!two_write_queues_) {
+      // Assign it after ::PreprocessWrite since the sequence might advance
+      // inside it by WriteRecoverableState
+      last_sequence = versions_->LastSequence();
+    }
+
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // Add to log and apply to memtable. We can release the lock
+  // during this phase since &w is currently responsible for logging
+  // and protects against concurrent loggers and concurrent writes
+  // into memtables
+
+  TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+  last_batch_group_size_ =
+      write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+  IOStatus io_s;
+  Status pre_release_cb_status;
+  if (status.ok()) {
+    // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+    // grabs but does not seem thread-safe.
+    if (tracer_) {
+      InstrumentedMutexLock lock(&trace_mutex_);
+      if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+        for (auto* writer : write_group) {
+          // TODO: maybe handle the tracing status?
+          tracer_->Write(writer->batch).PermitUncheckedError();
+        }
+      }
+    }
+    // Rules for when we can update the memtable concurrently
+    // 1. supported by memtable
+    // 2. Puts are not okay if inplace_update_support
+    // 3. Merges are not okay
+    //
+    // Rules 1..2 are enforced by checking the options
+    // during startup (CheckConcurrentWritesSupported), so if
+    // options.allow_concurrent_memtable_write is true then they can be
+    // assumed to be true. Rule 3 is checked for each batch. We could
+    // relax rule 2 if we could prevent write batches from referring
+    // more than once to a particular key.
+    bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+                    write_group.size > 1;
+    size_t total_count = 0;
+    size_t valid_batches = 0;
+    size_t total_byte_size = 0;
+    size_t pre_release_callback_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer);
+      if (writer->CheckCallback(this)) {
+        valid_batches += writer->batch_cnt;
+        if (writer->ShouldWriteToMemtable()) {
+          total_count += WriteBatchInternal::Count(writer->batch);
+          parallel = parallel && !writer->batch->HasMerge();
+        }
+        total_byte_size = WriteBatchInternal::AppendedByteSize(
+            total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        if (writer->pre_release_callback) {
+          pre_release_callback_cnt++;
+        }
+      }
+    }
+    // Note about seq_per_batch_: either disableWAL is set for the entire write
+    // group or not. In either case we inc seq for each write batch with no
+    // failed callback. This means that there could be a batch with
+    // disable_memtable in between; although we do not write this batch to
+    // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+    // the seq per valid written key to mem.
+    size_t seq_inc = seq_per_batch_ ?
valid_batches : total_count; + + const bool concurrent_update = two_write_queues_; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, + concurrent_update); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (!two_write_queues_) { + if (status.ok() && !write_options.disableWAL) { + assert(log_context.log_file_number_size); + LogFileNumberSize& log_file_number_size = + *(log_context.log_file_number_size); + PERF_TIMER_GUARD(write_wal_time); + io_s = + WriteToWAL(write_group, log_context.writer, log_used, + log_context.need_log_sync, log_context.need_log_dir_sync, + last_sequence + 1, log_file_number_size); + } + } else { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); + } else { + // Otherwise we inc seq number for memtable writes + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + } + status = io_s; + assert(last_sequence != kMaxSequenceNumber); + const SequenceNumber current_sequence = last_sequence + 1; + last_sequence += seq_inc; + + // PreReleaseCallback is called after WAL write and before memtable write + if (status.ok()) { + SequenceNumber next_sequence = current_sequence; + size_t index = 0; + // Note: the logic for advancing seq here must be consistent with the + // logic in WriteBatchInternal::InsertInto(write_group...) as well as + // with WriteBatchInternal::InsertInto(write_batch...) that is called on + // the merged batch during recovery from the WAL. 
+      for (auto* writer : write_group) {
+        if (writer->CallbackFailed()) {
+          continue;
+        }
+        writer->sequence = next_sequence;
+        if (writer->pre_release_callback) {
+          Status ws = writer->pre_release_callback->Callback(
+              writer->sequence, disable_memtable, writer->log_used, index++,
+              pre_release_callback_cnt);
+          if (!ws.ok()) {
+            status = pre_release_cb_status = ws;
+            break;
+          }
+        }
+        if (seq_per_batch_) {
+          assert(writer->batch_cnt);
+          next_sequence += writer->batch_cnt;
+        } else if (writer->ShouldWriteToMemtable()) {
+          next_sequence += WriteBatchInternal::Count(writer->batch);
+        }
+      }
+    }
+
+    if (status.ok()) {
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      if (!parallel) {
+        // w.sequence will be set inside InsertInto
+        w.status = WriteBatchInternal::InsertInto(
+            write_group, current_sequence, column_family_memtables_.get(),
+            &flush_scheduler_, &trim_history_scheduler_,
+            write_options.ignore_missing_column_families,
+            0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+            batch_per_txn_);
+      } else {
+        write_group.last_sequence = last_sequence;
+        write_thread_.LaunchParallelMemTableWriters(&write_group);
+        in_parallel_group = true;
+
+        // Each parallel follower does its own memtable writes. The leader
+        // should also do its own.
+        if (w.ShouldWriteToMemtable()) {
+          ColumnFamilyMemTablesImpl column_family_memtables(
+              versions_->GetColumnFamilySet());
+          assert(w.sequence == current_sequence);
+          w.status = WriteBatchInternal::InsertInto(
+              &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+              &trim_history_scheduler_,
+              write_options.ignore_missing_column_families, 0 /*log_number*/,
+              this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+              w.batch_cnt, batch_per_txn_,
+              write_options.memtable_insert_hint_per_batch);
+        }
+      }
+      if (seq_used != nullptr) {
+        *seq_used = w.sequence;
+      }
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!io_s.ok()) {
+    // Check WriteToWAL status
+    IOStatusCheck(io_s);
+  }
+  if (!w.CallbackFailed()) {
+    if (!io_s.ok()) {
+      assert(pre_release_cb_status.ok());
+    } else {
+      WriteStatusCheck(pre_release_cb_status);
+    }
+  } else {
+    assert(pre_release_cb_status.ok());
+  }
+
+  if (log_context.need_log_sync) {
+    VersionEdit synced_wals;
+    log_write_mutex_.Lock();
+    if (status.ok()) {
+      MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                     &synced_wals);
+    } else {
+      MarkLogsNotSynced(logfile_number_);
+    }
+    log_write_mutex_.Unlock();
+    if (status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      status = ApplyWALToManifest(&synced_wals);
+    }
+
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+    if (two_write_queues_) {
+      if (manual_wal_flush_) {
+        status = FlushWAL(true);
+      } else {
+        status = SyncWAL();
+      }
+    }
+  }
+
+  bool should_exit_batch_group = true;
+  if (in_parallel_group) {
+    // CompleteParallelWorker returns true if this thread should
+    // handle exit, false means somebody else did
+    should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+  }
+  if (should_exit_batch_group) {
+    if (status.ok()) {
+      for (auto* tmp_w : write_group) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      // Note: if we are to resume after non-OK statuses we need to revisit
+      // how we react to non-OK statuses here.
+      versions_->SetLastSequence(last_sequence);
+    }
+    MemTableInsertStatusCheck(w.status);
+    write_thread_.ExitAsBatchGroupLeader(write_group, status);
+  }
+
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+                                  WriteBatch* my_batch, WriteCallback* callback,
+                                  uint64_t* log_used, uint64_t log_ref,
+                                  bool disable_memtable, uint64_t* seq_used) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  WriteContext write_context;
+
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, /*_batch_cnt=*/0,
+                        /*_pre_release_callback=*/nullptr);
+  write_thread_.JoinBatchGroup(&w);
+  TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
+  if (w.state == WriteThread::STATE_GROUP_LEADER) {
+    WriteThread::WriteGroup wal_write_group;
+    if (w.callback && !w.callback->AllowWriteBatching()) {
+      write_thread_.WaitForMemTableWriters();
+    }
+    LogContext log_context(!write_options.disableWAL && write_options.sync);
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    w.status = PreprocessWrite(write_options, &log_context, &write_context);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+
+    // This can set a non-OK status if a callback fails.
+    last_batch_group_size_ =
+        write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+    const SequenceNumber current_sequence =
+        write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+    size_t total_count = 0;
+    size_t total_byte_size = 0;
+
+    if (w.status.ok()) {
+      // TODO: this use of operator bool on `tracer_` can avoid unnecessary
+      // lock grabs but does not seem thread-safe.
+      if (tracer_) {
+        InstrumentedMutexLock lock(&trace_mutex_);
+        if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+          for (auto* writer : wal_write_group) {
+            // TODO: maybe handle the tracing status?
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
+        }
+      }
+      SequenceNumber next_sequence = current_sequence;
+      for (auto* writer : wal_write_group) {
+        assert(writer);
+        if (writer->CheckCallback(this)) {
+          if (writer->ShouldWriteToMemtable()) {
+            writer->sequence = next_sequence;
+            size_t count = WriteBatchInternal::Count(writer->batch);
+            next_sequence += count;
+            total_count += count;
+          }
+          total_byte_size = WriteBatchInternal::AppendedByteSize(
+              total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        }
+      }
+      if (w.disable_wal) {
+        has_unpersisted_data_.store(true, std::memory_order_relaxed);
+      }
+      write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+    }
+
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+    stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+    RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+    RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    IOStatus io_s;
+    io_s.PermitUncheckedError();  // Allow io_s to be uninitialized
+
+    if (w.status.ok() && !write_options.disableWAL) {
+      PERF_TIMER_GUARD(write_wal_time);
+      stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+      RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+      if (wal_write_group.size > 1) {
+        stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                          wal_write_group.size - 1);
+        RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+      }
+      assert(log_context.log_file_number_size);
+      LogFileNumberSize& log_file_number_size =
+          *(log_context.log_file_number_size);
+      io_s =
+          WriteToWAL(wal_write_group, log_context.writer, log_used,
+                     log_context.need_log_sync, log_context.need_log_dir_sync,
+                     current_sequence, log_file_number_size);
+      w.status = io_s;
+    }
+
+    if (!io_s.ok()) {
+      // Check WriteToWAL status
+      IOStatusCheck(io_s);
+    } else if (!w.CallbackFailed()) {
+      WriteStatusCheck(w.status);
+    }
+
+    VersionEdit synced_wals;
+    if (log_context.need_log_sync) {
+      InstrumentedMutexLock l(&log_write_mutex_);
+      if (w.status.ok()) {
+        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                       &synced_wals);
+      } else {
+        MarkLogsNotSynced(logfile_number_);
+      }
+    }
+    if (w.status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      w.status = ApplyWALToManifest(&synced_wals);
+    }
+    write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+  }
+
+  // NOTE: the memtable_write_group is declared before the following
+  // `if` statement because its lifetime needs to be longer than the
+  // inner scope of the `if`, as a reference to it may be used further
+  // below within the outer write thread.
+  WriteThread::WriteGroup memtable_write_group;
+
+  if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+    PERF_TIMER_GUARD(write_memtable_time);
+    assert(w.ShouldWriteToMemtable());
+    write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+    if (memtable_write_group.size > 1 &&
+        immutable_db_options_.allow_concurrent_memtable_write) {
+      write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+    } else {
+      memtable_write_group.status = WriteBatchInternal::InsertInto(
+          memtable_write_group, w.sequence, column_family_memtables_.get(),
+          &flush_scheduler_, &trim_history_scheduler_,
+          write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+          false /*concurrent_memtable_writes*/, seq_per_batch_,
batch_per_txn_);
+      versions_->SetLastSequence(memtable_write_group.last_sequence);
+      write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+    }
+  } else {
+    // NOTE: the memtable_write_group is never really used,
+    // so we need to set its status to pass ASSERT_STATUS_CHECKED
+    memtable_write_group.status.PermitUncheckedError();
+  }
+
+  if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+    assert(w.ShouldWriteToMemtable());
+    ColumnFamilyMemTablesImpl column_family_memtables(
+        versions_->GetColumnFamilySet());
+    w.status = WriteBatchInternal::InsertInto(
+        &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+        &trim_history_scheduler_, write_options.ignore_missing_column_families,
+        0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+        false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+        write_options.memtable_insert_hint_per_batch);
+    if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      MemTableInsertStatusCheck(w.status);
+      versions_->SetLastSequence(w.write_group->last_sequence);
+      write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+    }
+  }
+  if (seq_used != nullptr) {
+    *seq_used = w.sequence;
+  }
+
+  assert(w.state == WriteThread::STATE_COMPLETED);
+  return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+                                      WriteBatch* my_batch,
+                                      WriteCallback* callback, uint64_t log_ref,
+                                      SequenceNumber seq,
+                                      const size_t sub_batch_cnt) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        false /*disable_memtable*/);
+
+  if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+    w.sequence = seq;
+    size_t total_count = WriteBatchInternal::Count(my_batch);
+    InternalStats* stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+    ColumnFamilyMemTablesImpl column_family_memtables(
+        versions_->GetColumnFamilySet());
+    w.status = WriteBatchInternal::InsertInto(
+        &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+        &trim_history_scheduler_, write_options.ignore_missing_column_families,
+        0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+        seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+        write_options.memtable_insert_hint_per_batch);
+    if (write_options.disableWAL) {
+      has_unpersisted_data_.store(true, std::memory_order_relaxed);
+    }
+  }
+
+  size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+  if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, so the update to pending_memtable_writes_ is not missed even
+    // though it is not modified under the mutex.
+    std::lock_guard<std::mutex> lck(switch_mutex_);
+    switch_cv_.notify_all();
+  }
+  WriteStatusCheck(w.status);
+
+  if (!w.FinalStatus().ok()) {
+    return w.FinalStatus();
+  }
+  return Status::OK();
+}
+
+// The 2nd write queue. If enabled it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence, which is only
+// applicable in a two-queue setting.
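+// For example (illustration only): under two_write_queues_, a 2PC Prepare
+// under the WriteCommitted policy lands here with disable_memtable=true, so
+// its batch reaches the WAL but not the memtable until commit. Under
+// unordered_write, regular writes also pass through here first (WAL only)
+// and are then applied to the memtable via UnorderedWriteMemtable.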
+Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, sub_batch_cnt, pre_release_callback); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); + + write_thread->JoinBatchGroup(&w); + assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + Status status; + + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. + if (status.ok()) { + LogContext log_context; + status = PreprocessWrite(write_options, &log_context, &write_context); + WriteStatusCheckOnLocked(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + + WriteThread::WriteGroup write_group; + uint64_t last_sequence; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + // Note: no need to update last_batch_group_size_ here since the batch writes + // to WAL only + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } + + size_t pre_release_callback_cnt = 0; + size_t total_byte_size = 0; + for (auto* writer : write_group) { + assert(writer); + if (writer->CheckCallback(this)) { + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + + const bool concurrent_update = true; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. 
+  auto stats = default_cf_internal_stats_;
+  stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+                    concurrent_update);
+  RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+  stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+                    concurrent_update);
+  RecordTick(stats_, WRITE_DONE_BY_SELF);
+  auto write_done_by_other = write_group.size - 1;
+  if (write_done_by_other > 0) {
+    stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                      write_done_by_other, concurrent_update);
+    RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+  }
+  RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+  PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+  PERF_TIMER_GUARD(write_wal_time);
+  // LastAllocatedSequence is increased inside WriteToWAL under
+  // wal_write_mutex_ to ensure ordered events in WAL
+  size_t seq_inc = 0 /* total_count */;
+  if (assign_order == kDoAssignOrder) {
+    size_t total_batch_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      if (!writer->CallbackFailed()) {
+        total_batch_cnt += writer->batch_cnt;
+      }
+    }
+    seq_inc = total_batch_cnt;
+  }
+  Status status;
+  if (!write_options.disableWAL) {
+    IOStatus io_s =
+        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    status = io_s;
+    // last_sequence may not be set if there is an error
+    // This error checking and return is moved up to avoid using uninitialized
+    // last_sequence.
+    if (!io_s.ok()) {
+      IOStatusCheck(io_s);
+      write_thread->ExitAsBatchGroupLeader(write_group, status);
+      return status;
+    }
+  } else {
+    // Otherwise we inc seq number to do solely the seq allocation
+    last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+  }
+
+  size_t memtable_write_cnt = 0;
+  auto curr_seq = last_sequence + 1;
+  for (auto* writer : write_group) {
+    if (writer->CallbackFailed()) {
+      continue;
+    }
+    writer->sequence = curr_seq;
+    if (assign_order == kDoAssignOrder) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      curr_seq += writer->batch_cnt;
+    }
+    if (!writer->disable_memtable) {
+      memtable_write_cnt++;
+    }
+    // else seq advances only by memtable writes
+  }
+  if (status.ok() && write_options.sync) {
+    assert(!write_options.disableWAL);
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+    if (manual_wal_flush_) {
+      status = FlushWAL(true);
+    } else {
+      status = SyncWAL();
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!w.CallbackFailed()) {
+    WriteStatusCheck(status);
+  }
+  if (status.ok()) {
+    size_t index = 0;
+    for (auto* writer : write_group) {
+      if (!writer->CallbackFailed() && writer->pre_release_callback) {
+        assert(writer->sequence != kMaxSequenceNumber);
+        Status ws = writer->pre_release_callback->Callback(
+            writer->sequence, disable_memtable, writer->log_used, index++,
+            pre_release_callback_cnt);
+        if (!ws.ok()) {
+          status = ws;
+          break;
+        }
+      }
+    }
+  }
+  if (publish_last_seq == kDoPublishLastSeq) {
+    versions_->SetLastSequence(last_sequence + seq_inc);
+    // Currently we only use kDoPublishLastSeq in unordered_write
+    assert(immutable_db_options_.unordered_write);
+  }
+  if (immutable_db_options_.unordered_write && status.ok()) {
+    pending_memtable_writes_ += memtable_write_cnt;
+  }
+  write_thread->ExitAsBatchGroupLeader(write_group, status);
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  if (seq_used != nullptr) {
+    *seq_used = w.sequence;
+  }
+  return status;
+}
+
+void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  InstrumentedMutexLock l(&mutex_);
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+  }
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  }
+}
+
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
+       !io_status.IsBusy() && !io_status.IsIncomplete()) ||
+      io_status.IsIOFenced()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  } else {
+    // Allow the writable file to continue being writable.
+    logs_.back().writer->file()->reset_seen_error();
+  }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+  // A non-OK status here indicates that the state implied by the
+  // WAL has diverged from the in-memory state. This could be
+  // because of a corrupt write_batch (very bad), or because the
+  // client specified an invalid column family and didn't specify
+  // ignore_missing_column_families.
+  if (!status.ok()) {
+    mutex_.Lock();
+    assert(!error_handler_.IsBGWorkStopped());
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
+        .PermitUncheckedError();
+    mutex_.Unlock();
+  }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+                               LogContext* log_context,
+                               WriteContext* write_context) {
+  assert(write_context != nullptr && log_context != nullptr);
+  Status status;
+
+  if (error_handler_.IsDBStopped()) {
+    InstrumentedMutexLock l(&mutex_);
+    status = error_handler_.GetBGError();
+  }
+
+  PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+    assert(versions_);
+    InstrumentedMutexLock l(&mutex_);
+    const ColumnFamilySet* const column_families =
+        versions_->GetColumnFamilySet();
+    assert(column_families);
+    size_t num_cfs = column_families->NumberOfColumnFamilies();
+    assert(num_cfs >= 1);
+    if (num_cfs > 1) {
+      WaitForPendingWrites();
+      status = SwitchWAL(write_context);
+    }
+  }
+
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+    // Before a new memtable is added in SwitchMemtable(),
+    // write_buffer_manager_->ShouldFlush() will keep returning true. If
+    // another thread is writing to another DB with the same write buffer,
+    // they may also be flushed. We may end up flushing many more DBs than
+    // needed. It's suboptimal but still correct.
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = HandleWriteBufferManagerFlush(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    status = TrimMemtableHistory(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = ScheduleFlushes(write_context);
+  }
+
+  PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+                               write_controller_.NeedsDelay()))) {
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it is an issue.
+    InstrumentedMutexLock l(&mutex_);
+    status = DelayWrite(last_batch_group_size_, write_options);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // If memory usage has exceeded a certain threshold,
+  // write_buffer_manager_->ShouldStall() returns true for all threads writing
+  // to all DBs and the writers will be stalled.
+  // This is a soft check because WriteBufferManager::buffer_limit_ has
+  // already been exceeded at this point, so no new write (including the
+  // current one) will go through until memory usage has decreased.
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
+    if (write_options.no_slowdown) {
+      status = Status::Incomplete("Write stall");
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      WriteBufferManagerStallWrites();
+    }
+  }
+  InstrumentedMutexLock l(&log_write_mutex_);
+  if (status.ok() && log_context->need_log_sync) {
+    // Wait until the parallel syncs are finished.
Any sync process has to sync
+    // the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // finished.
+    // Note: there does not seem to be a reason to wait for parallel sync at
+    // this early step, but it is not important since parallel sync (SyncWAL)
+    // and need_log_sync are usually not used together.
+    while (logs_.front().IsSyncing()) {
+      log_sync_cv_.Wait();
+    }
+    for (auto& log : logs_) {
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+      // Note: there does not seem to be a reason to set this early before we
+      // actually write to the WAL
+      log.PrepareForSync();
+    }
+  } else {
+    log_context->need_log_sync = false;
+  }
+  log_context->writer = logs_.back().writer;
+  log_context->need_log_dir_sync =
+      log_context->need_log_dir_sync && !log_dir_synced_;
+  log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+
+  return status;
+}
+
+Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+                          WriteBatch* tmp_batch, WriteBatch** merged_batch,
+                          size_t* write_with_wal,
+                          WriteBatch** to_be_cached_state) {
+  assert(write_with_wal != nullptr);
+  assert(tmp_batch != nullptr);
+  assert(*to_be_cached_state == nullptr);
+  *write_with_wal = 0;
+  auto* leader = write_group.leader;
+  assert(!leader->disable_wal);  // Same holds for all in the batch group
+  if (write_group.size == 1 && !leader->CallbackFailed() &&
+      leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the first WriteBatch to the WAL if the group contains
+    // only one batch, that batch should be written to the WAL, and the batch
+    // does not need to be truncated.
+    *merged_batch = leader->batch;
+    if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
+      *to_be_cached_state = *merged_batch;
+    }
+    *write_with_wal = 1;
+  } else {
+    // WAL needs all of the batches flattened into a single batch.
+    // We could avoid copying here with an iov-like AddRecord
+    // interface.
+    *merged_batch = tmp_batch;
+    for (auto writer : write_group) {
+      if (!writer->CallbackFailed()) {
+        Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
+                                              /*WAL_only*/ true);
+        if (!s.ok()) {
+          tmp_batch->Clear();
+          return s;
+        }
+        if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+          // We only need to cache the last such write batch
+          *to_be_cached_state = writer->batch;
+        }
+        (*write_with_wal)++;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            uint64_t* log_size,
+                            Env::IOPriority rate_limiter_priority,
+                            LogFileNumberSize& log_file_number_size) {
+  assert(log_size != nullptr);
+
+  Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
+  auto s = merged_batch.VerifyChecksum();
+  if (!s.ok()) {
+    return status_to_io_status(std::move(s));
+  }
+  *log_size = log_entry.size();
+  // With two_write_queues_, WriteToWAL has to be protected from concurrent
+  // calls from the two queues anyway, and log_write_mutex_ is already held.
+  // Otherwise, if manual_wal_flush_ is enabled, we need to protect
+  // log_writer->AddRecord from possible concurrent calls via FlushWAL by the
+  // application.
+  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, we penalize
+  // the new manual_wal_flush_ feature (with UNLIKELY) instead of the more
+  // common case when we do not need any locking.
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Lock();
+  }
+  IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
+
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Unlock();
+  }
+  if (log_used != nullptr) {
+    *log_used = logfile_number_;
+  }
+  total_log_size_ += log_entry.size();
+  log_file_number_size.AddSize(*log_size);
+  log_empty_ = false;
+  return io_s;
+}
+
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            bool need_log_sync, bool need_log_dir_sync,
+                            SequenceNumber sequence,
+                            LogFileNumberSize& log_file_number_size) {
+  IOStatus io_s;
+  assert(!two_write_queues_);
+  assert(!write_group.leader->disable_wal);
+  // Same holds for all in the batch group
+  size_t write_with_wal = 0;
+  WriteBatch* to_be_cached_state = nullptr;
+  WriteBatch* merged_batch;
+  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
+                                        &write_with_wal, &to_be_cached_state));
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
+  }
+
+  if (merged_batch == write_group.leader->batch) {
+    write_group.leader->log_used = logfile_number_;
+  } else if (write_with_wal > 1) {
+    for (auto writer : write_group) {
+      writer->log_used = logfile_number_;
+    }
+  }
+
+  WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+  uint64_t log_size;
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                    write_group.leader->rate_limiter_priority,
+                    log_file_number_size);
+  if (to_be_cached_state) {
+    cached_recoverable_state_ = *to_be_cached_state;
+    cached_recoverable_state_empty_ = false;
+  }
+
+  if (io_s.ok() && need_log_sync) {
+    StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
+    // It's safe to access logs_ with unlocked mutex_ here because:
+    //  - we've set getting_synced=true for all logs,
+    //    so other threads won't pop from logs_ while we're here,
+    //  - only writer thread can push to logs_, and we're in
+    //    writer thread, so no one will push to logs_,
+    //  - as long as other threads don't modify it, it's safe to read
+    //    from std::deque from multiple threads concurrently.
+    //
+    // Sync operation should work with locked log_write_mutex_, because:
+    // when DBOptions.manual_wal_flush_ is set,
+    // FlushWAL function will be invoked by another thread.
+    // Without holding log_write_mutex_, the log file may get data
+    // corruption.
+
+    const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Lock();
+    }
+
+    for (auto& log : logs_) {
+      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
+        break;
+      }
+    }
+
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Unlock();
+    }
+
+    if (io_s.ok() && need_log_dir_sync) {
+      // We only sync the WAL directory the first time WAL syncing is
+      // requested, so that in case users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + } + + if (merged_batch == &tmp_batch_) { + tmp_batch_.Clear(); + } + if (io_s.ok()) { + auto stats = default_cf_internal_stats_; + if (need_log_sync) { + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); + RecordTick(stats_, WAL_FILE_SYNCED); + } + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return io_s; +} + +IOStatus DBImpl::ConcurrentWriteToWAL( + const WriteThread::WriteGroup& write_group, uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc) { + IOStatus io_s; + + assert(two_write_queues_ || immutable_db_options_.unordered_write); + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + WriteBatch tmp_batch; + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch; + io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch, + &write_with_wal, &to_be_cached_state)); + if (UNLIKELY(!io_s.ok())) { + return io_s; + } + + // We need to lock log_write_mutex_ since logs_ and alive_log_files might be + // pushed back concurrently + log_write_mutex_.Lock(); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + auto sequence = *last_sequence + 1; + WriteBatchInternal::SetSequence(merged_batch, sequence); + + log::Writer* log_writer = logs_.back().writer; + LogFileNumberSize& log_file_number_size = alive_log_files_.back(); + + assert(log_writer->get_log_number() == log_file_number_size.number); + + uint64_t log_size; + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, + write_group.leader->rate_limiter_priority, + log_file_number_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + log_write_mutex_.Unlock(); + + if (io_s.ok()) { + const bool concurrent = true; + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, + concurrent); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return io_s; +} + +Status DBImpl::WriteRecoverableState() { + mutex_.AssertHeld(); + if (!cached_recoverable_state_empty_) { + bool dont_care_bool; + SequenceNumber next_seq; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + SequenceNumber seq; + if (two_write_queues_) { + seq = versions_->FetchAddLastAllocatedSequence(0); + } else { + seq = versions_->LastSequence(); + } + WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); + auto status = WriteBatchInternal::InsertInto( + &cached_recoverable_state_, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); + auto last_seq = next_seq - 1; + if (two_write_queues_) { + 
+      versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+      versions_->SetLastPublishedSequence(last_seq);
+    }
+    versions_->SetLastSequence(last_seq);
+    if (two_write_queues_) {
+      log_write_mutex_.Unlock();
+    }
+    if (status.ok() && recoverable_state_pre_release_callback_) {
+      const bool DISABLE_MEMTABLE = true;
+      for (uint64_t sub_batch_seq = seq + 1;
+           sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+        uint64_t const no_log_num = 0;
+        // Unlock it since the callback might end up locking mutex. e.g.,
+        // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+        mutex_.Unlock();
+        status = recoverable_state_pre_release_callback_->Callback(
+            sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+        mutex_.Lock();
+      }
+    }
+    if (status.ok()) {
+      cached_recoverable_state_.Clear();
+      cached_recoverable_state_empty_ = true;
+    }
+    return status;
+  }
+  return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+    autovector<ColumnFamilyData*>* cfds) {
+  for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+        !cached_recoverable_state_empty_.load()) {
+      cfds->push_back(cfd);
+    }
+  }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+  assert(immutable_db_options_.atomic_flush);
+  auto seq = versions_->LastSequence();
+  for (auto cfd : cfds) {
+    cfd->imm()->AssignAtomicFlushSeq(seq);
+  }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  if (alive_log_files_.begin()->getting_flushed) {
+    return status;
+  }
+
+  auto oldest_alive_log = alive_log_files_.begin()->number;
+  bool flush_wont_release_oldest_log = false;
+  if (allow_2pc()) {
+    auto oldest_log_with_uncommitted_prep =
+        logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+    assert(oldest_log_with_uncommitted_prep == 0 ||
+           oldest_log_with_uncommitted_prep >= oldest_alive_log);
+    if (oldest_log_with_uncommitted_prep > 0 &&
+        oldest_log_with_uncommitted_prep == oldest_alive_log) {
+      if (unable_to_release_oldest_log_) {
+        // We already attempted to flush all column families dependent on
+        // the oldest alive log, but the log still contained uncommitted
+        // transactions, so there is still nothing that we can do.
+        return status;
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to release oldest log due to uncommitted transaction");
+        unable_to_release_oldest_log_ = true;
+        flush_wont_release_oldest_log = true;
+      }
+    }
+  }
+  if (!flush_wont_release_oldest_log) {
+    // We only mark this log as getting flushed if we have successfully
+    // flushed all data in this log. If this log contains outstanding prepared
+    // transactions then we cannot flush this log until those transactions are
+    // committed.
+    unable_to_release_oldest_log_ = false;
+    alive_log_files_.begin()->getting_flushed = true;
+  }
+
+  ROCKS_LOG_INFO(
+      immutable_db_options_.info_log,
+      "Flushing all column families with data in WAL number %" PRIu64
+      ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+      oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // No need to refcount because the drop happens in the write thread, so it
+  // can't happen while we're in the write thread.
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+        cfds.push_back(cfd);
+      }
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (const auto cfd : cfds) {
+    cfd->Ref();
+    status = SwitchMemtable(cfd, write_context);
+    cfd->UnrefAndTryDelete();
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+    }
+    for (auto cfd : cfds) {
+      cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, &flush_req);
+        SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, they may also
+  // be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+  // No need to refcount because the drop happens in the write thread, so it
+  // can't happen while we're in the write thread.
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    ColumnFamilyData* cfd_picked = nullptr;
+    SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
+        // We only consider flushing CFs with bytes in the mutable memtable,
+        // and no immutable memtables for which flush has yet to finish. If
+        // we triggered flush on CFs already trying to flush, we would risk
+        // creating too many immutable memtables, leading to write stalls.
+        uint64_t seq = cfd->mem()->GetCreationSeq();
+        if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+          cfd_picked = cfd;
+          seq_num_for_cf_picked = seq;
+        }
+      }
+    }
+    if (cfd_picked != nullptr) {
+      cfds.push_back(cfd_picked);
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  if (!cfds.empty()) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "Flushing triggered to alleviate write buffer memory usage. Write "
Write " + "buffer is using %" ROCKSDB_PRIszt + " bytes out of a total of %" ROCKSDB_PRIszt ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); + } + + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (const auto cfd : cfds) { + if (cfd->mem()->IsEmpty()) { + continue; + } + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (const auto cfd : cfds) { + cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +uint64_t DBImpl::GetMaxTotalWalSize() const { + uint64_t max_total_wal_size = + max_total_wal_size_.load(std::memory_order_acquire); + if (max_total_wal_size > 0) { + return max_total_wal_size; + } + return 4 * max_total_in_memory_state_.load(std::memory_order_acquire); +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::DelayWrite(uint64_t num_bytes, + const WriteOptions& write_options) { + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + &time_delayed); + uint64_t delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); + TEST_SYNC_POINT("DBImpl::DelayWrite:Start"); + if (delay > 0) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; + uint64_t stall_end = sw.start_time() + delay; + while (write_controller_.NeedsDelay()) { + if (immutable_db_options_.clock->NowMicros() >= stall_end) { + // We already delayed this write `delay` microseconds + break; + } + + delayed = true; + // Sleep for 0.001 seconds + immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); + } + mutex_.Lock(); + write_thread_.EndWriteStall(); + } + + // Don't wait if there's a background error, even if its a soft error. 
We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely
+    while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
+           !shutting_down_.load(std::memory_order_relaxed)) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete("Write stall");
+      }
+      delayed = true;
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+      write_thread_.BeginWriteStall();
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+      bg_cv_.Wait();
+      write_thread_.EndWriteStall();
+    }
+  }
+  assert(!delayed || !write_options.no_slowdown);
+  if (delayed) {
+    default_cf_internal_stats_->AddDBStats(
+        InternalStats::kIntStatsWriteStallMicros, time_delayed);
+    RecordTick(stats_, STALL_MICROS, time_delayed);
+  }
+
+  // If DB is not in read-only mode and write_controller is not stopping
+  // writes, we can ignore any background errors and allow the write to
+  // proceed
+  Status s;
+  if (write_controller_.IsStopped()) {
+    if (!shutting_down_.load(std::memory_order_relaxed)) {
+      // If writes are still stopped and db not shutdown, it means we bailed
+      // due to a background error
+      s = Status::Incomplete(error_handler_.GetBGError().ToString());
+    } else {
+      s = Status::ShutdownInProgress("stalled writes");
+    }
+  }
+  if (error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+void DBImpl::WriteBufferManagerStallWrites() {
+  mutex_.AssertHeld();
+  // First block future writer threads who want to add themselves to the queue
+  // of WriteThread.
+  write_thread_.BeginWriteStall();
+  mutex_.Unlock();
+
+  // Change the state to State::Blocked.
+  static_cast<WBMStallInterface*>(wbm_stall_.get())
+      ->SetState(WBMStallInterface::State::BLOCKED);
+  // Then WriteBufferManager will add DB instance to its queue
+  // and block this thread by calling WBMStallInterface::Block().
+  write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
+  wbm_stall_->Block();
+
+  mutex_.Lock();
+  // Stall has ended. Signal writer threads so that they can add
+  // themselves to the WriteThread queue for writes.
+  write_thread_.EndWriteStall();
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                            WriteBatch* my_batch) {
+  assert(write_options.low_pri);
+  // This is called outside the DB mutex. Although it is safe to make the call,
+  // the consistency condition is not guaranteed to hold. It's OK to live with
+  // it in this case.
+  // If we need to speed compaction, it means the compaction is left behind
+  // and we start to limit low pri writes to a limit.
+  if (write_controller_.NeedSpeedupCompaction()) {
+    if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+      // For 2PC, we only rate limit prepare, not commit.
+      return Status::OK();
+    }
+    if (write_options.no_slowdown) {
+      return Status::Incomplete("Low priority write stall");
+    } else {
+      assert(my_batch != nullptr);
+      // Rate limit those writes. The reason that we don't completely wait
+      // is that in case the write is heavy, low pri writes may never have
+      // a chance to run. Now we guarantee we are still slowly making
+      // progress.
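+      // For illustration only (caller side, not part of this function): a
+      // write becomes eligible for this limiter roughly as follows:
+      //   WriteOptions wo;
+      //   wo.low_pri = true;       // may be throttled here
+      //   wo.no_slowdown = false;  // wait rather than get Status::Incomplete
+      //   db->Put(wo, key, value);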
+      PERF_TIMER_GUARD(write_delay_time);
+      write_controller_.low_pri_rate_limiter()->Request(
+          my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+          RateLimiter::OpType::kWrite);
+    }
+  }
+  return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+  assert(cfds != nullptr);
+  if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+    ColumnFamilyData* cfd_stats =
+        versions_->GetColumnFamilySet()->GetColumnFamily(
+            kPersistentStatsColumnFamilyName);
+    if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+      for (ColumnFamilyData* cfd : *cfds) {
+        if (cfd == cfd_stats) {
+          // stats CF already included in cfds
+          return;
+        }
+      }
+      // force flush stats CF when its log number is less than all other CF's
+      // log numbers
+      bool force_flush_stats_cf = true;
+      for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+        if (loop_cfd == cfd_stats) {
+          continue;
+        }
+        if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+          force_flush_stats_cf = false;
+        }
+      }
+      if (force_flush_stats_cf) {
+        cfds->push_back(cfd_stats);
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Force flushing stats CF with automated flush "
+                       "to avoid holding old logs");
+      }
+    }
+  }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  ColumnFamilyData* tmp_cfd;
+  while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+         nullptr) {
+    cfds.push_back(tmp_cfd);
+  }
+  for (auto& cfd : cfds) {
+    autovector<MemTable*> to_delete;
+    bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
+                                           cfd->mem()->MemoryAllocatedBytes());
+    if (trimmed) {
+      context->superversion_context.NewSuperVersion();
+      assert(context->superversion_context.new_superversion.get() != nullptr);
+      cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+    }
+
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+    for (auto cfd : cfds) {
+      cfd->Ref();
+    }
+    flush_scheduler_.Clear();
+  } else {
+    ColumnFamilyData* tmp_cfd;
+    while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+      cfds.push_back(tmp_cfd);
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  Status status;
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (auto& cfd : cfds) {
+    if (!cfd->mem()->IsEmpty()) {
+      status = SwitchMemtable(cfd, context);
+    }
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+    if (!status.ok()) {
+      break;
+    }
+  }
+
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+    } else {
+      for (auto* cfd : cfds) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, &flush_req);
+        SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+      }
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+                                    const MemTableInfo& mem_table_info) {
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  mutex_.Unlock();
+  for (auto listener : 
immutable_db_options_.listeners) { + listener->OnMemTableSealed(mem_table_info); + } + mutex_.Lock(); +} +#endif // ROCKSDB_LITE + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +// REQUIRES: this thread is currently at the front of the 2nd writer queue if +// two_write_queues_ is true (This is to simplify the reasoning.) +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { + mutex_.AssertHeld(); + log::Writer* new_log = nullptr; + MemTable* new_mem = nullptr; + IOStatus io_s; + + // Recoverable state is persisted in WAL. After memtable switch, WAL might + // be deleted, so we write the state to memtable to be persisted as well. + Status s = WriteRecoverableState(); + if (!s.ok()) { + return s; + } + + // Attempt to switch to a new memtable and trigger flush of old. + // Do this without holding the dbmutex lock. + assert(versions_->prev_log_number() == 0); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + bool creating_new_log = !log_empty_; + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + uint64_t recycle_log_number = 0; + if (creating_new_log && immutable_db_options_.recycle_log_file_num && + !log_recycle_files_.empty()) { + recycle_log_number = log_recycle_files_.front(); + } + uint64_t new_log_number = + creating_new_log ? versions_->NewFileNumber() : logfile_number_; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + + // Set memtable_info for memtable sealed callback +#ifndef ROCKSDB_LITE + MemTableInfo memtable_info; + memtable_info.cf_name = cfd->GetName(); + memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); + memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.num_entries = cfd->mem()->num_entries(); + memtable_info.num_deletes = cfd->mem()->num_deletes(); +#endif // ROCKSDB_LITE + // Log this later after lock release. It may be outdated, e.g., if background + // flush happens before logging, but that should be ok. + int num_imm_unflushed = cfd->imm()->NumNotFlushed(); + const auto preallocate_block_size = + GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + mutex_.Unlock(); + if (creating_new_log) { + // TODO: Write buffer size passed in should be max of all CF's instead + // of mutable_cf_options.write_buffer_size. + io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, + &new_log); + if (s.ok()) { + s = io_s; + } + } + if (s.ok()) { + SequenceNumber seq = versions_->LastSequence(); + new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); + context->superversion_context.NewSuperVersion(); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 + ". Immutable memtables: %d.\n", + cfd->GetName().c_str(), new_log_number, num_imm_unflushed); + // There should be no concurrent write as the thread is at the front of + // writer queue + cfd->mem()->ConstructFragmentedRangeTombstones(); + + mutex_.Lock(); + if (recycle_log_number != 0) { + // Since renaming the file is done outside DB mutex, we need to ensure + // concurrent full purges don't delete the file while we're recycling it. + // To achieve that we hold the old log number in the recyclable list until + // after it has been renamed. 
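+    // (WAL recycling is opt-in: as a sketch, setting e.g.
+    //  options.recycle_log_file_num = 4 in DBOptions allows up to four
+    //  obsolete WALs to be kept on this list and reused here.)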
+    assert(log_recycle_files_.front() == recycle_log_number);
+    log_recycle_files_.pop_front();
+  }
+  if (s.ok() && creating_new_log) {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    assert(new_log != nullptr);
+    if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one
+      log::Writer* cur_log_writer = logs_.back().writer;
+      if (error_handler_.IsRecoveryInProgress()) {
+        // In recovery path, we force another try of writing WAL buffer.
+        cur_log_writer->file()->reset_seen_error();
+      }
+      io_s = cur_log_writer->WriteBuffer();
+      if (s.ok()) {
+        s = io_s;
+      }
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+                       " WAL file\n",
+                       cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+                       new_log_number);
+      }
+    }
+    if (s.ok()) {
+      logfile_number_ = new_log_number;
+      log_empty_ = true;
+      log_dir_synced_ = false;
+      logs_.emplace_back(logfile_number_, new_log);
+      alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+    }
+  }
+
+  if (!s.ok()) {
+    // how do we fail if we're not creating new log?
+    assert(creating_new_log);
+    delete new_mem;
+    delete new_log;
+    context->superversion_context.new_superversion.reset();
+    // We may have lost data from the WritableFileBuffer in-memory buffer for
+    // the current log, so treat it as a fatal error and set bg_error
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+    } else {
+      error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+    }
+    // Read back bg_error in order to get the right severity
+    s = error_handler_.GetBGError();
+    return s;
+  }
+
+  bool empty_cf_updated = false;
+  if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+      !immutable_db_options_.allow_2pc && creating_new_log) {
+    // In non-2pc mode, WALs become obsolete if they do not contain unflushed
+    // data. Updating the empty CF's log number might cause some WALs to become
+    // obsolete. So we should track the WAL obsoletion event before actually
+    // updating the empty CF's log number.
+    uint64_t min_wal_number_to_keep =
+        versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+    if (min_wal_number_to_keep >
+        versions_->GetWalSet().GetMinWalNumberToKeep()) {
+      // Get a snapshot of the empty column families.
+      // LogAndApply may release and reacquire db
+      // mutex, during that period, column family may become empty (e.g. its
+      // flush succeeds), then it affects the computed min_log_number_to_keep,
+      // so we take a snapshot for consistency of column family data
+      // status. If a column family becomes non-empty afterwards, its active log
+      // should still be the created new log, so the min_log_number_to_keep is
+      // not affected.
+      autovector<ColumnFamilyData*> empty_cfs;
+      for (auto cf : *versions_->GetColumnFamilySet()) {
+        if (cf->IsEmpty()) {
+          empty_cfs.push_back(cf);
+        }
+      }
+
+      VersionEdit wal_deletion;
+      wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+      s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_,
+                                                      directories_.GetDbDir());
+      if (!s.ok() && versions_->io_status().IsIOError()) {
+        s = error_handler_.SetBGError(versions_->io_status(),
+                                      BackgroundErrorReason::kManifestWrite);
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      for (auto cf : empty_cfs) {
+        if (cf->IsEmpty()) {
+          cf->SetLogNumber(logfile_number_);
+          // MEMPURGE: No need to change this, because new adds
+          // should still receive new sequence numbers.
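+          // (The creation sequence recorded below is what
+          // HandleWriteBufferManagerFlush() reads back via GetCreationSeq()
+          // when choosing the column family with the oldest data to flush.)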
+          cf->mem()->SetCreationSeq(versions_->LastSequence());
+        }  // cf may become non-empty.
+      }
+      empty_cf_updated = true;
+    }
+  }
+  if (!empty_cf_updated) {
+    for (auto cf : *versions_->GetColumnFamilySet()) {
+      // all this is just optimization to delete logs that
+      // are no longer needed -- if CF is empty, that means it
+      // doesn't need that particular log to stay alive, so we just
+      // advance the log number. no need to persist this in the manifest
+      if (cf->IsEmpty()) {
+        if (creating_new_log) {
+          cf->SetLogNumber(logfile_number_);
+        }
+        cf->mem()->SetCreationSeq(versions_->LastSequence());
+      }
+    }
+  }
+
+  cfd->mem()->SetNextLogNumber(logfile_number_);
+  assert(new_mem != nullptr);
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+  new_mem->Ref();
+  cfd->SetMemtable(new_mem);
+  InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+                                     mutable_cf_options);
+
+#ifndef ROCKSDB_LITE
+  // Notify client that memtable is sealed, now that we have successfully
+  // installed a new memtable
+  NotifyOnMemTableSealed(cfd, memtable_info);
+#endif  // ROCKSDB_LITE
+  // It is possible that we got here without checking the value of io_s, but
+  // that is okay. If we did, it most likely means that s was already an error.
+  // In any case, ignore any unchecked error for io_s here.
+  io_s.PermitUncheckedError();
+  return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+  mutex_.AssertHeld();
+  size_t bsize =
+      static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+  // Some users might set very high write_buffer_size and rely on
+  // max_total_wal_size or other parameters to control the WAL size.
+  if (mutable_db_options_.max_total_wal_size > 0) {
+    bsize = std::min(
+        bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+  }
+  if (immutable_db_options_.db_write_buffer_size > 0) {
+    bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size);
+  }
+  if (immutable_db_options_.write_buffer_manager &&
+      immutable_db_options_.write_buffer_manager->enabled()) {
+    bsize = std::min(
+        bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+  }
+
+  return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
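+  // That is, 24 = 8 (header) + 4 (count) + 1 (record type) + 11 (room for
+  // the varint32 key and value lengths).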
+ WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */, + opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.Put(column_family, key, ts, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) { + const ColumnFamilyHandle* const default_cf = DefaultColumnFamily(); + assert(default_cf); + + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + + WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0, + options.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + + const Status s = batch.PutEntity(column_family, key, columns); + if (!s.ok()) { + return s; + } + + return Write(options, &batch); +} + +Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key) { + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */); + Status s = batch.Delete(column_family, key); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.Delete(column_family, key, ts); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::SingleDelete(const WriteOptions& opt, + ColumnFamilyHandle* column_family, const Slice& key) { + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */); + Status s = batch.SingleDelete(column_family, key); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::SingleDelete(const WriteOptions& opt, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.SingleDelete(column_family, key, ts); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::DeleteRange(const WriteOptions& opt, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */); + Status s = batch.DeleteRange(column_family, begin_key, end_key); + if 
(!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::DeleteRange(const WriteOptions& opt, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.DeleteRange(column_family, begin_key, end_key, ts); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */); + Status s = batch.Merge(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.Merge(column_family, key, ts, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc new file mode 100644 index 000000000..be8d5bee1 --- /dev/null +++ b/src/rocksdb/db/db_info_dumper.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
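+//
+// DumpDBFileSummary() logs a best-effort, human-readable inventory of the
+// database's files (CURRENT, IDENTITY, MANIFEST, WAL files and the first few
+// SST file names per directory) to the info log.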
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id) {
+  if (options.info_log == nullptr) {
+    return;
+  }
+
+  auto* env = options.env;
+  uint64_t number = 0;
+  FileType type = kInfoLogFile;
+
+  std::vector<std::string> files;
+  uint64_t file_num = 0;
+  uint64_t file_size;
+  std::string file_info, wal_info;
+
+  Header(options.info_log, "DB SUMMARY\n");
+  Header(options.info_log, "DB Session ID: %s\n", session_id.c_str());
+
+  Status s;
+  // Get files in dbname dir
+  s = env->GetChildren(dbname, &files);
+  if (!s.ok()) {
+    Error(options.info_log, "Error when reading %s dir %s\n", dbname.c_str(),
+          s.ToString().c_str());
+  }
+  std::sort(files.begin(), files.end());
+  for (const std::string& file : files) {
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+    switch (type) {
+      case kCurrentFile:
+        Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+        break;
+      case kIdentityFile:
+        Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+        break;
+      case kDescriptorFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          Header(options.info_log,
+                 "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(),
+                 file_size);
+        } else {
+          Error(options.info_log,
+                "Error when reading MANIFEST file: %s/%s %s\n", dbname.c_str(),
+                file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kWalFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          wal_info.append(file)
+              .append(" size: ")
+              .append(std::to_string(file_size))
+              .append(" ; ");
+        } else {
+          Error(options.info_log, "Error when reading LOG file: %s/%s %s\n",
+                dbname.c_str(), file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kTableFile:
+        if (++file_num < 10) {
+          file_info.append(file).append(" ");
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Get sst files in db_path dir
+  for (auto& db_path : options.db_paths) {
+    if (dbname.compare(db_path.path) != 0) {
+      s = env->GetChildren(db_path.path, &files);
+      if (!s.ok()) {
+        Error(options.info_log, "Error when reading %s dir %s\n",
+              db_path.path.c_str(), s.ToString().c_str());
+        continue;
+      }
+      std::sort(files.begin(), files.end());
+      for (const std::string& file : files) {
+        if (ParseFileName(file, &number, &type)) {
+          if (type == kTableFile && ++file_num < 10) {
+            file_info.append(file).append(" ");
+          }
+        }
+      }
+    }
+    Header(options.info_log,
+           "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+           db_path.path.c_str(), file_num, file_info.c_str());
+    file_num = 0;
+    file_info.clear();
+  }
+
+  // Get wal file in wal_dir
+  const auto& wal_dir = options.GetWalDir(dbname);
+  if (!options.IsWalDirSameAsDBPath(dbname)) {
+    s = env->GetChildren(wal_dir, &files);
+    if (!s.ok()) {
+      Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(),
+            s.ToString().c_str());
+      return;
+    }
+    wal_info.clear();
+    for (const std::string& file : files) {
+      if (ParseFileName(file, &number, &type)) {
+        if (type == kWalFile) {
+          s = env->GetFileSize(wal_dir + "/" + file, &file_size);
+          if (s.ok()) {
+            wal_info.append(file)
+                .append(" size: ")
+                .append(std::to_string(file_size))
+                .append(" ; ");
+          } else {
+            Error(options.info_log, "Error when reading LOG file %s/%s %s\n",
+                  wal_dir.c_str(), file.c_str(), s.ToString().c_str());
+          }
+        }
+      }
+    }
+  }
+  Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(),
+         wal_info.c_str());
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..f518e840f
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id = "");
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..3921a3b00
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+  DBTestInPlaceUpdate()
+      : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
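+    // (validateNumberOfEntries() walks the raw internal memtable entries;
+    // with inplace_update_support, a same-size-or-smaller value overwrites
+    // the existing slot instead of appending, hence a single entry.)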
+ validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + } + + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntitySmallerNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.allow_concurrent_memtable_write = false; + + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size + constexpr int num_values = 10; + for (int i = num_values; i > 0; --i) { + constexpr char key[] = "key"; + const std::string value = DummyString(i, 'a'); + WideColumns wide_columns{{"attr", value}}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns)); + // TODO: use Get to check entity once it's supported + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntityLargerNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.allow_concurrent_memtable_write = false; + + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of larger size + constexpr int num_values = 10; + for (int i = 0; i < num_values; ++i) { + constexpr char key[] = "key"; + const std::string value = DummyString(i, 'a'); + WideColumns wide_columns{{"attr", value}}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns)); + // TODO: use Get to check entity once it's supported + } + + // All 10 updates exist in the internal iterator + validateNumberOfEntries(num_values, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key")); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get(1, "key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get(1, "key")); + } + + // No inplace updates. All updates are puts with new seq number + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Callback function requests no actions from db + ASSERT_OK(Put(1, "key", DummyString(1, 'a'))); + ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size, and + // run GetSnapshot and ReleaseSnapshot + int numValues = 2; + for (int i = numValues; i > 0; i--) { + const Snapshot* s = db_->GetSnapshot(); + ASSERT_EQ(nullptr, s); + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + // release s (nullptr) + db_->ReleaseSnapshot(s); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc new file mode 100644 index 000000000..2a405fd38 --- /dev/null +++ b/src/rocksdb/db/db_io_failure_test.cc @@ -0,0 +1,593 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBIOFailureTest : public DBTestBase { + public: + DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {} +}; + +#ifndef ROCKSDB_LITE +// Check that number of files does not grow when writes are dropped +TEST_F(DBIOFailureTest, DropWrites) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.paranoid_checks = false; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const size_t num_files = CountFiles(); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + env_->sleep_counter_.Reset(); + env_->SetMockSleep(); + for (int i = 0; i < 5; i++) { + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + for (int level = 0; level < dbfull()->NumberLevels(); level++) { + if (level > 0 && level == dbfull()->NumberLevels() - 1) { + break; + } + Status s = + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.ok() || s.IsCorruption()); + } + } else { + Status s = + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsCorruption()); + } + } + + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + const size_t count = CountFiles(); + ASSERT_LT(count, num_files + 3); + + // Check that compaction attempts slept after errors + // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler + // versions + ASSERT_GE(env_->sleep_counter_.Read(), 4); + } while (ChangeCompactOptions()); +} + +// Check background error counter bumped on flush failures. +TEST_F(DBIOFailureTest, DropWritesFlush) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.max_background_flushes = 1; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + + std::string property_value; + // Background error count is 0 now. 
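+    // ("rocksdb.background-errors" is a DB property read via GetProperty();
+    // it returns the cumulative count as a decimal string, which is why the
+    // assertions below compare against "0" and "1".)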
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); + + // ASSERT file is too short + ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption()); + + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("1", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST_F(DBIOFailureTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); + } + + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); + + Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.IsIOError()); + ASSERT_TRUE(s.IsNoSpace()); + + env_->no_space_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} +#endif // ROCKSDB_LITE + +TEST_F(DBIOFailureTest, NonWritableFileSystem) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writeable_rate_.store(100); + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writeable_rate_.store(0); + } while (ChangeCompactOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIOFailureTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is named in log record + + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + std::atomic* error_type = (iter == 0) ? 
&env_->manifest_sync_error_ + : &env_->manifest_write_error_; + + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Memtable compaction (will succeed) + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("foo")); + const int last = 2; + MoveFilesToLevel(2); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + ASSERT_NOK( + dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail + ASSERT_EQ("bar", Get("foo")); + + error_type->store(false, std::memory_order_release); + + // Since paranoid_checks=true, writes should fail + ASSERT_NOK(Put("foo2", "bar2")); + + // Recovery: should not lose data + ASSERT_EQ("bar", Get("foo")); + + // Try again with paranoid_checks=false + Close(); + options.paranoid_checks = false; + Reopen(options); + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + Status s = + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + if (iter == 0) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsIOError()); + } + ASSERT_EQ("bar", Get("foo")); + + // Recovery: should not lose data + error_type->store(false, std::memory_order_release); + Reopen(options); + ASSERT_EQ("bar", Get("foo")); + + // Since paranoid_checks=false, writes should succeed + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar2", Get("foo2")); + } +} + +TEST_F(DBIOFailureTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + ASSERT_NOK(Put(1, "foo2", "bar2")); + env_->log_write_error_.store(false, std::memory_order_release); + // the next put should fail, too + ASSERT_NOK(Put(1, "foo3", "bar3")); + // but we're still able to read + ASSERT_EQ("bar", Get(1, "foo")); + + // do the same thing with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + ASSERT_NOK(Put(1, "foo2", "bar2")); + env_->log_write_error_.store(false, std::memory_order_release); + // the next put should NOT fail + ASSERT_OK(Put(1, "foo3", "bar3")); +} +#if !(defined NDEBUG) || !defined(OS_WIN) +TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.write_buffer_size = 256 * 1024 * 1024; + options.writable_file_max_buffer_size = 128 * 1024; + options.bytes_per_sync = 128 * 1024; + 
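+  // (With bytes_per_sync = 128KB the table writer requests a RangeSync
+  // roughly every 128KB written; the SpecialEnv::SStableFile::RangeSync sync
+  // point used below intercepts that call to inject a failure.)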
options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(10));
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const char* io_error_msg = "range sync dummy error";
+  std::atomic<int> range_sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+        if (range_sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::string rnd_str =
+      rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_GE(1, range_sync_called.load());
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 256 * 1024 * 1024;
+  options.writable_file_max_buffer_size = 128 * 1024;
+  options.bytes_per_sync = 128 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 256 * 1024 * 1024;
+  options.disable_auto_compactions = true;
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  std::string rnd_str =
+      rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+  const char* io_error_msg = "range sync dummy error";
+  std::atomic<int> range_sync_called(0);
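+  // (Same injection pattern as above: only the first RangeSync issued during
+  // the compaction is converted into an IOError, via fetch_add(1) == 0.)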
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { + if (range_sync_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError(io_error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); + + // Following writes should fail as flush failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_GE(1, range_sync_called.load()); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar", Get(1, "foo")); +} + +TEST_F(DBIOFailureTest, FlushSstCloseError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 4; + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + const char* io_error_msg = "close dummy error"; + std::atomic close_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Close", [&](void* arg) { + if (close_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError(io_error_msg); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar2")); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); + + // Following writes should fail as flush failed. 
+ ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); +} + +TEST_F(DBIOFailureTest, CompactionSstCloseError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar2")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar3")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const char* io_error_msg = "close dummy error"; + std::atomic close_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Close", [&](void* arg) { + if (close_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError(io_error_msg); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); + + // Following writes should fail as compaction failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar3", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar3", Get(1, "foo")); +} + +TEST_F(DBIOFailureTest, FlushSstSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.use_fsync = false; + options.level0_file_num_compaction_trigger = 4; + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + const char* io_error_msg = "sync dummy error"; + std::atomic sync_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Sync", [&](void* arg) { + if (sync_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError(io_error_msg); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar2")); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); + + // Following writes should fail as flush failed. 
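+  // (Because paranoid_checks = true was set above, the injected sync error
+  // is latched as a background error; the DB rejects new writes until it is
+  // reopened below.)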
+ ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); +} + +TEST_F(DBIOFailureTest, CompactionSstSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + options.use_fsync = false; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar2")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar3")); + ASSERT_OK(Put(1, "foo2", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const char* io_error_msg = "sync dummy error"; + std::atomic sync_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Sync", [&](void* arg) { + if (sync_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError(io_error_msg); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); + + // Following writes should fail as compaction failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar3", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar3", Get(1, "foo")); +} +#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc new file mode 100644 index 000000000..e1375deb7 --- /dev/null +++ b/src/rocksdb/db/db_iter.cc @@ -0,0 +1,1708 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
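+//
+// DBIter wraps an InternalIterator over the memtables and SST files and
+// collapses the internal key space (user key, sequence number, value type)
+// into the user-visible iterator view: deletions are hidden, merge operands
+// are combined, and only entries visible at the iterator's sequence number
+// are surfaced.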
+
+#include "db/db_iter.h"
+
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/wide/wide_column_serialization.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+               const ImmutableOptions& ioptions,
+               const MutableCFOptions& mutable_cf_options,
+               const Comparator* cmp, InternalIterator* iter,
+               const Version* version, SequenceNumber s, bool arena_mode,
+               uint64_t max_sequential_skip_in_iterations,
+               ReadCallback* read_callback, DBImpl* db_impl,
+               ColumnFamilyData* cfd, bool expose_blob_index)
+    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      env_(_env),
+      clock_(ioptions.clock),
+      logger_(ioptions.logger),
+      user_comparator_(cmp),
+      merge_operator_(ioptions.merge_operator.get()),
+      iter_(iter),
+      version_(version),
+      read_callback_(read_callback),
+      sequence_(s),
+      statistics_(ioptions.stats),
+      max_skip_(max_sequential_skip_in_iterations),
+      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
+      num_internal_keys_skipped_(0),
+      iterate_lower_bound_(read_options.iterate_lower_bound),
+      iterate_upper_bound_(read_options.iterate_upper_bound),
+      direction_(kForward),
+      valid_(false),
+      current_entry_is_merged_(false),
+      is_key_seqnum_zero_(false),
+      prefix_same_as_start_(mutable_cf_options.prefix_extractor
+                                ? read_options.prefix_same_as_start
+                                : false),
+      pin_thru_lifetime_(read_options.pin_data),
+      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+                                     read_options.total_order_seek ||
+                                     read_options.auto_prefix_mode),
+      read_tier_(read_options.read_tier),
+      fill_cache_(read_options.fill_cache),
+      verify_checksums_(read_options.verify_checksums),
+      expose_blob_index_(expose_blob_index),
+      is_blob_(false),
+      arena_mode_(arena_mode),
+      db_impl_(db_impl),
+      cfd_(cfd),
+      timestamp_ub_(read_options.timestamp),
+      timestamp_lb_(read_options.iter_start_ts),
+      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+  RecordTick(statistics_, NO_ITERATOR_CREATED);
+  if (pin_thru_lifetime_) {
+    pinned_iters_mgr_.StartPinning();
+  }
+  if (iter_.iter()) {
+    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+  }
+  status_.PermitUncheckedError();
+  assert(timestamp_size_ ==
+         user_comparator_.user_comparator()->timestamp_size());
+}
+
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+  if (prop == nullptr) {
+    return Status::InvalidArgument("prop is nullptr");
+  }
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    return iter_.iter()->GetProperty(prop_name, prop);
+  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+    if (valid_) {
+      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ?
"1" : "0"; + } else { + *prop = "Iterator is not valid."; + } + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.internal-key") { + *prop = saved_key_.GetUserKey().ToString(); + return Status::OK(); + } + return Status::InvalidArgument("Unidentified property."); +} + +bool DBIter::ParseKey(ParsedInternalKey* ikey) { + Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */); + if (!s.ok()) { + status_ = Status::Corruption("In DBIter: ", s.getState()); + valid_ = false; + ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState()); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + assert(status_.ok()); + + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); + // Release temporarily pinned blocks from last operation + ReleaseTempPinnedData(); + ResetBlobValue(); + ResetValueAndColumns(); + local_stats_.skip_count_ += num_internal_keys_skipped_; + local_stats_.skip_count_--; + num_internal_keys_skipped_ = 0; + bool ok = true; + if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; + if (!ReverseToForward()) { + ok = false; + } + } else if (!current_entry_is_merged_) { + // If the current value is not a merge, the iter position is the + // current key, which is already returned. We can safely issue a + // Next() without checking the current key. + // If the current key is a merge, very likely iter already points + // to the next internal position. + assert(iter_.Valid()); + iter_.Next(); + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + local_stats_.next_count_++; + if (ok && iter_.Valid()) { + ClearSavedValue(); + + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + const Slice prefix = prefix_.GetUserKey(); + FindNextUserEntry(true /* skipping the current user key */, &prefix); + } else { + FindNextUserEntry(true /* skipping the current user key */, nullptr); + } + } else { + is_key_seqnum_zero_ = false; + valid_ = false; + } + if (statistics_ != nullptr && valid_) { + local_stats_.next_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); + } +} + +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) { + assert(!is_blob_); + assert(blob_value_.empty()); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. 
+ ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.fill_cache = fill_cache_; + read_options.verify_checksums = verify_checksums_; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + prefetch_buffer, &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + +bool DBIter::SetValueAndColumnsFromEntity(Slice slice) { + assert(value_.empty()); + assert(wide_columns_.empty()); + + const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + if (!wide_columns_.empty() && + wide_columns_[0].name() == kDefaultWideColumnName) { + value_ = wide_columns_[0].value(); + } + + return true; +} + +// PRE: saved_key_ has the current user key if skipping_saved_key +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker or a sequence number higher than sequence_ +// saved_key_ MUST have a proper user_key before calling this function +// +// The prefix parameter, if not null, indicates that we need to iterate +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { + PERF_TIMER_GUARD(find_next_user_entry_time); + return FindNextUserEntryInternal(skipping_saved_key, prefix); +} + +// Actual implementation of DBIter::FindNextUserEntry() +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { + // Loop until we hit an acceptable entry to yield + assert(iter_.Valid()); + assert(status_.ok()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + + // How many times in a row we have skipped an entry with user key less than + // or equal to saved_key_. We could skip these entries either because + // sequence numbers were too high or because skipping_saved_key = true. + // What saved_key_ contains throughout this method: + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, and we haven't seen any keys greater + // than that, + // - if num_skipped > 0 : saved_key_ contains the key that we have skipped + // num_skipped times, and we haven't seen any keys + // greater than that, + // - none of the above : saved_key_ can contain anything, it doesn't + // matter. + uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; + + do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. 
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; + if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; + return false; + } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); + + is_key_seqnum_zero_ = (ikey_.sequence == 0); + + assert(iterate_upper_bound_ == nullptr || + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) < 0); + if (iterate_upper_bound_ != nullptr && + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + break; + } + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { + assert(prefix_same_as_start_); + break; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey_.user_key.size() >= timestamp_size_); + Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_) + : Slice(); + bool more_recent = false; + if (IsVisible(ikey_.sequence, ts, &more_recent)) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because it will be more + // prone to bugs causing the same user key with the same sequence number. + // Note that with current timestamp implementation, the same user key can + // have different timestamps and zero sequence number on the bottommost + // level. This may change in the future. + if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) && + skipping_saved_key && + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + assert(!skipping_saved_key || + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } + num_skipped = 0; + reseek_done = false; + switch (ikey_.type) { + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + if (timestamp_lb_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + saved_key_.SetUserKey( + ikey_.user_key, !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + skipping_saved_key = true; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } + break; + case kTypeValue: + case kTypeBlobIndex: + case kTypeWideColumnEntity: + if (timestamp_lb_) { + saved_key_.SetInternalKey(ikey_); + } else { + saved_key_.SetUserKey( + ikey_.user_key, !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + } + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + + SetValueAndColumnsFromPlain(expose_blob_index_ ? 
iter_.value()
+                                                             : blob_value_);
+          } else if (ikey_.type == kTypeWideColumnEntity) {
+            if (!SetValueAndColumnsFromEntity(iter_.value())) {
+              return false;
+            }
+          } else {
+            assert(ikey_.type == kTypeValue);
+            SetValueAndColumnsFromPlain(iter_.value());
+          }
+
+          valid_ = true;
+          return true;
+          break;
+        case kTypeMerge:
+          saved_key_.SetUserKey(
+              ikey_.user_key,
+              !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+          // By now, we are sure the current ikey is going to yield a value
+          current_entry_is_merged_ = true;
+          valid_ = true;
+          return MergeValuesNewToOld();  // Go to a different state machine
+          break;
+        default:
+          valid_ = false;
+          status_ = Status::Corruption(
+              "Unknown value type: " +
+              std::to_string(static_cast<unsigned char>(ikey_.type)));
+          return false;
+      }
+    }
+  } else {
+    if (more_recent) {
+      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+    }
+
+    // This key was inserted after our snapshot was taken or skipped by
+    // timestamp range. If this happens too many times in a row for the same
+    // user key, we want to seek to the target sequence number.
+    int cmp = user_comparator_.CompareWithoutTimestamp(
+        ikey_.user_key, saved_key_.GetUserKey());
+    if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+      num_skipped++;
+    } else {
+      saved_key_.SetUserKey(
+          ikey_.user_key,
+          !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+      skipping_saved_key = false;
+      num_skipped = 0;
+      reseek_done = false;
+    }
+  }
+
+    // If we have sequentially iterated via numerous equal keys, then it's
+    // better to seek so that we can avoid too many key comparisons.
+    //
+    // To avoid infinite loops, do not reseek if we have already attempted to
+    // reseek previously.
+    //
+    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+    // then it does not make sense to reseek as we would actually land further
+    // away from the desired key. There is opportunity for optimization here.
+    if (num_skipped > max_skip_ && !reseek_done) {
+      is_key_seqnum_zero_ = false;
+      num_skipped = 0;
+      reseek_done = true;
+      std::string last_key;
+      if (skipping_saved_key) {
+        // We're looking for the next user-key but all we see are the same
+        // user-key with decreasing sequence numbers. Fast forward to
+        // sequence number 0 and type deletion (the smallest type).
+        if (timestamp_size_ == 0) {
+          AppendInternalKey(
+              &last_key,
+              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
+        } else {
+          const std::string kTsMin(timestamp_size_, '\0');
+          AppendInternalKeyWithDifferentTimestamp(
+              &last_key,
+              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
+              kTsMin);
+        }
+        // Don't set skipping_saved_key = false because we may still see more
+        // user-keys equal to saved_key_.
+      } else {
+        // We saw multiple entries with this user key and sequence numbers
+        // higher than sequence_. Fast forward to sequence_.
+        // Note that this only covers a case when a higher key was overwritten
+        // many times since our snapshot was taken, not the case when a lot of
+        // different keys were inserted after our snapshot was taken.
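+        // E.g. with sequence_ == 100 and "k" rewritten at seqnos 250, 200,
+        // ..., 110, seeking to ("k", 100, kValueTypeForSeek) skips all of
+        // the invisible newer versions in a single jump.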
+ if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } + } + iter_.Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_.Next(); + } + } while (iter_.Valid()); + + valid_ = false; + return iter_.status().ok(); +} + +// Merge values of the same user key starting from the current iter_ position +// Scan from the newer entries to older entries. +// PRE: iter_.key() points to the first merge type entry +// saved_key_ stores the user key +// iter_.PrepareValue() has been called +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +bool DBIter::MergeValuesNewToOld() { + if (!merge_operator_) { + ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null."); + status_ = Status::InvalidArgument("merge_operator_ must be set."); + valid_ = false; + return false; + } + + // Temporarily pin the blocks that hold merge operands + TempPinData(); + merge_context_.Clear(); + // Start the merge process by pushing the first operand + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); + + ParsedInternalKey ikey; + for (iter_.Next(); iter_.Valid(); iter_.Next()) { + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); + if (!ParseKey(&ikey)) { + return false; + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { + // hit the next user key, stop right here + break; + } + if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + kTypeDeletionWithTimestamp == ikey.type) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_.Next(); + break; + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + const Slice val = iter_.value(); + if (!Merge(&val, ikey.user_key)) { + return false; + } + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; + } else if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + } else if (kTypeBlobIndex == ikey.type) { + if (expose_blob_index_) { + status_ = + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; + } + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! 
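+      // (Here the "put" is a blob reference: SetBlobValueIfNeeded() below
+      // resolves it into blob_value_ before the collected operands are
+      // merged on top of it.)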
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, ikey.user_key)) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      return true;
+    } else if (kTypeWideColumnEntity == ikey.type) {
+      if (!MergeEntity(iter_.value(), ikey.user_key)) {
+        return false;
+      }
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unrecognized value type: " +
+          std::to_string(static_cast<unsigned char>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+  assert(status_.ok());
+  return true;
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+  assert(status_.ok());
+
+  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  bool ok = true;
+  if (direction_ == kForward) {
+    if (!ReverseToBackward()) {
+      ok = false;
+    }
+  }
+  if (ok) {
+    ClearSavedValue();
+
+    Slice prefix;
+    if (prefix_same_as_start_) {
+      assert(prefix_extractor_ != nullptr);
+      prefix = prefix_.GetUserKey();
+    }
+    PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+  }
+
+  if (statistics_ != nullptr) {
+    local_stats_.prev_count_++;
+    if (valid_) {
+      local_stats_.prev_found_count_++;
+      local_stats_.bytes_read_ += (key().size() + value().size());
+    }
+  }
+}
+
+bool DBIter::ReverseToForward() {
+  assert(iter_.status().ok());
+
+  // When moving backwards, iter_ is positioned on _previous_ key, which may
+  // not exist or may have different prefix than the current key().
+  // If that's the case, seek iter_ to current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    IterKey last_key;
+    ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    if (timestamp_size_ > 0) {
+      // TODO: pre-create kTsMax.
+      const std::string kTsMax(timestamp_size_, '\xff');
+      pikey.SetTimestamp(kTsMax);
+    }
+    last_key.SetInternalKey(pikey);
+    iter_.Seek(last_key.GetInternalKey());
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  direction_ = kForward;
+  // Skip keys less than the current key() (a.k.a. saved_key_).
+  while (iter_.Valid()) {
+    ParsedInternalKey ikey;
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >=
+        0) {
+      return true;
+    }
+    iter_.Next();
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+  assert(iter_.status().ok());
+
+  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
+  // key, which may not exist or may have prefix different from current.
+  // If that's the case, seek to saved_key_.
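+  // (Two concrete hazards: the merge may have exhausted the inner iterator,
+  // leaving nothing to Prev() from, and in prefix-seek mode the iterator may
+  // be parked on a different prefix where stepping backward is not safe.)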
+ if (current_entry_is_merged_ && + (!expect_total_order_inner_iter() || !iter_.Valid())) { + IterKey last_key; + // Using kMaxSequenceNumber and kValueTypeForSeek + // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller + // than saved_key_. + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + if (!expect_total_order_inner_iter()) { + iter_.SeekForPrev(last_key.GetInternalKey()); + } else { + // Some iterators may not support SeekForPrev(), so we avoid using it + // when prefix seek mode is disabled. This is somewhat expensive + // (an extra Prev(), as well as an extra change of direction of iter_), + // so we may need to reconsider it later. + iter_.Seek(last_key.GetInternalKey()); + if (!iter_.Valid() && iter_.status().ok()) { + iter_.SeekToLast(); + } + } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } + + direction_ = kReverse; + return FindUserKeyBeforeSavedKey(); +} + +void DBIter::PrevInternal(const Slice* prefix) { + while (iter_.Valid()) { + saved_key_.SetUserKey( + ExtractUserKey(iter_.key()), + !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); + // Current key does not have the same prefix as start + valid_ = false; + return; + } + + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { + // We've iterated earlier than the user-specified lower bound. + valid_ = false; + return; + } + + if (!FindValueForCurrentKey()) { // assigns valid_ + return; + } + + // Whether or not we found a value for current key, we need iter_ to end up + // on a smaller key. + if (!FindUserKeyBeforeSavedKey()) { + return; + } + + if (valid_) { + // Found the value. + return; + } + + if (TooManyInternalKeysSkipped(false)) { + return; + } + } + + // We haven't found any key - iterator is not valid + valid_ = false; +} + +// Used for backwards iteration. +// Looks at the entries with user key saved_key_ and finds the most up-to-date +// value for it, or executes a merge, or determines that the value was deleted. +// Sets valid_ to true if the value is found and is ready to be presented to +// the user through value(). +// Sets valid_ to false if the value was deleted, and we should try another key. +// Returns false if an error occurred, and !status().ok() and !valid_. +// +// PRE: iter_ is positioned on the last entry with user key equal to saved_key_. +// POST: iter_ is positioned on one of the entries equal to saved_key_, or on +// the entry just before them, or on the entry just after them. 
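+// Example resolution under an additive merge operator with snapshot 100:
+//   ("k", 80, Merge +1), ("k", 60, Merge +2), ("k", 40, Put 7)
+// Moving backward, the entries are encountered oldest-to-newest: the Put
+// pins the base value 7, the merges are queued with PushOperandBack(), and
+// the final value is FullMerge(7, {+2, +1}) = 10.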
+bool DBIter::FindValueForCurrentKey() { + assert(iter_.Valid()); + merge_context_.Clear(); + current_entry_is_merged_ = false; + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue, + // kTypeBlobIndex, or kTypeWideColumnEntity) + ValueType last_not_merge_type = kTypeDeletion; + ValueType last_key_entry_type = kTypeDeletion; + + // If false, it indicates that we have not seen any valid entry, even though + // last_key_entry_type is initialized to kTypeDeletion. + bool valid_entry_seen = false; + + // Temporarily pin blocks that hold (merge operands / the value) + ReleaseTempPinnedData(); + TempPinData(); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { + // Found a smaller user key, thus we are done with current user key. + break; + } + + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + + bool visible = IsVisible(ikey.sequence, ts); + if (!visible && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) { + // Found an invisible version of the current user key, and it must have + // a higher sequence number or timestamp. Therefore, we are done with the + // current user key. + break; + } + + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + // This user key has lots of entries. + // We're going from old to new, and it's taking too long. Let's do a Seek() + // and go from new to old. This helps when a key was overwritten many times. + if (num_skipped >= max_skip_) { + return FindValueForCurrentKeyUsingSeek(); + } + + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (timestamp_lb_ != nullptr) { + // Only needed when timestamp_lb_ is not null + [[maybe_unused]] const bool ret = ParseKey(&ikey_); + saved_ikey_.assign(iter_.key().data(), iter_.key().size()); + // Since the preceding ParseKey(&ikey) succeeds, so must this. 
+      assert(ret);
+    }
+
+    valid_entry_seen = true;
+    last_key_entry_type = ikey.type;
+    switch (last_key_entry_type) {
+      case kTypeValue:
+      case kTypeBlobIndex:
+      case kTypeWideColumnEntity:
+        if (iter_.iter()->IsValuePinned()) {
+          pinned_value_ = iter_.value();
+        } else {
+          valid_ = false;
+          status_ = Status::NotSupported(
+              "Backward iteration not supported if underlying iterator's value "
+              "cannot be pinned.");
+        }
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        if (!status_.ok()) {
+          return false;
+        }
+        break;
+      case kTypeDeletion:
+      case kTypeDeletionWithTimestamp:
+      case kTypeSingleDeletion:
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+        break;
+      case kTypeMerge: {
+        assert(merge_operator_ != nullptr);
+        merge_context_.PushOperandBack(
+            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+        PERF_COUNTER_ADD(internal_merge_count, 1);
+      } break;
+      default:
+        valid_ = false;
+        status_ = Status::Corruption(
+            "Unknown value type: " +
+            std::to_string(static_cast<unsigned char>(last_key_entry_type)));
+        return false;
+    }
+
+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    iter_.Prev();
+    ++num_skipped;
+
+    if (visible && timestamp_lb_ != nullptr) {
+      // If timestamp_lb_ is not nullptr, we do not have to look further for
+      // another internal key. We can return this current internal key. Yet we
+      // still keep the invariant that iter_ is positioned before the returned
+      // key.
+      break;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  if (!valid_entry_seen) {
+    // Since we haven't seen any valid entry, last_key_entry_type remains
+    // unchanged and the same as its initial value.
+    assert(last_key_entry_type == kTypeDeletion);
+    assert(last_not_merge_type == kTypeDeletion);
+    valid_ = false;
+    return true;
+  }
+
+  if (timestamp_lb_ != nullptr) {
+    assert(last_key_entry_type == ikey_.type);
+  }
+
+  Status s;
+  s.PermitUncheckedError();
+
+  switch (last_key_entry_type) {
+    case kTypeDeletion:
+    case kTypeDeletionWithTimestamp:
+    case kTypeSingleDeletion:
+      if (timestamp_lb_ == nullptr) {
+        valid_ = false;
+      } else {
+        saved_key_.SetInternalKey(saved_ikey_);
+        valid_ = true;
+      }
+      return true;
+    case kTypeMerge:
+      current_entry_is_merged_ = true;
+      if (last_not_merge_type == kTypeDeletion ||
+          last_not_merge_type == kTypeSingleDeletion ||
+          last_not_merge_type == kTypeDeletionWithTimestamp) {
+        if (!Merge(nullptr, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      } else if (last_not_merge_type == kTypeBlobIndex) {
+        if (expose_blob_index_) {
+          status_ =
+              Status::NotSupported("BlobDB does not support merge operator.");
+          valid_ = false;
+          return false;
+        }
+        if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+          return false;
+        }
+        valid_ = true;
+        if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        ResetBlobValue();
+
+        return true;
+      } else if (last_not_merge_type == kTypeWideColumnEntity) {
+        if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        return true;
+      } else {
+        assert(last_not_merge_type == kTypeValue);
+        if (!Merge(&pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      }
+      break;
+    case kTypeValue:
+      if (timestamp_lb_ != nullptr) {
+        saved_key_.SetInternalKey(saved_ikey_);
+      }
+
+      SetValueAndColumnsFromPlain(pinned_value_);
+
+      break;
+    case kTypeBlobIndex:
+      if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+        return false;
+      }
+
+      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+                                                     : blob_value_);
+
+      break;
+    case kTypeWideColumnEntity:
+      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+        return false;
+      }
+      break;
+    default:
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned char>(last_key_entry_type)));
+      return false;
+  }
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+  valid_ = true;
+  return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+  // FindValueForCurrentKey will enable pinning before calling
+  // FindValueForCurrentKeyUsingSeek()
+  assert(pinned_iters_mgr_.PinningEnabled());
+  std::string last_key;
+  if (0 == timestamp_size_) {
+    AppendInternalKey(&last_key,
+                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                                        kValueTypeForSeek));
+  } else {
+    AppendInternalKeyWithDifferentTimestamp(
+        &last_key,
+        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                          kValueTypeForSeek),
+        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
+  }
+  iter_.Seek(last_key);
+  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // In case read_callback presents, the value we seek to may not be visible.
+  // Find the next value that's visible.
+  ParsedInternalKey ikey;
+
+  while (true) {
+    if (!iter_.Valid()) {
+      valid_ = false;
+      return iter_.status().ok();
+    }
+
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    assert(ikey.user_key.size() >= timestamp_size_);
+    Slice ts;
+    if (timestamp_size_ > 0) {
+      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+                 timestamp_size_);
+    }
+
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      // No visible values for this key, even though FindValueForCurrentKey()
+      // has seen some. This is possible if we're using a tailing iterator, and
+      // the entries were discarded in a compaction.
+      valid_ = false;
+      return true;
+    }
+
+    if (IsVisible(ikey.sequence, ts)) {
+      break;
+    }
+
+    iter_.Next();
+  }
+
+  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+      kTypeDeletionWithTimestamp == ikey.type) {
+    if (timestamp_lb_ == nullptr) {
+      valid_ = false;
+    } else {
+      valid_ = true;
+      saved_key_.SetInternalKey(ikey);
+    }
+    return true;
+  }
+  if (!iter_.PrepareValue()) {
+    valid_ = false;
+    return false;
+  }
+  if (timestamp_size_ > 0) {
+    Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
+    saved_timestamp_.assign(ts.data(), ts.size());
+  }
+  if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex ||
+      ikey.type == kTypeWideColumnEntity) {
+    assert(iter_.iter()->IsValuePinned());
+    pinned_value_ = iter_.value();
+    if (ikey.type == kTypeBlobIndex) {
+      if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
+        return false;
+      }
+
+      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+                                                     : blob_value_);
+    } else if (ikey.type == kTypeWideColumnEntity) {
+      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+        return false;
+      }
+    } else {
+      assert(ikey.type == kTypeValue);
+      SetValueAndColumnsFromPlain(pinned_value_);
+    }
+
+    if (timestamp_lb_ != nullptr) {
+      saved_key_.SetInternalKey(ikey);
+    }
+
+    valid_ = true;
+    return true;
+  }
+
+  // kTypeMerge. We need to collect all kTypeMerge values and save them
+  // in operands
+  assert(ikey.type == kTypeMerge);
+  current_entry_is_merged_ = true;
+  merge_context_.Clear();
+  merge_context_.PushOperand(
+      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+  while (true) {
+    iter_.Next();
+
+    if (!iter_.Valid()) {
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      break;
+    }
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      break;
+    }
+    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+        ikey.type == kTypeDeletionWithTimestamp) {
+      break;
+    }
+    if (!iter_.PrepareValue()) {
+      valid_ = false;
+      return false;
+    }
+
+    if (ikey.type == kTypeValue) {
+      const Slice val = iter_.value();
+      if (!Merge(&val, saved_key_.GetUserKey())) {
+        return false;
+      }
+      return true;
+    } else if (ikey.type == kTypeMerge) {
+      merge_context_.PushOperand(
+          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
+    } else if (ikey.type == kTypeBlobIndex) {
+      if (expose_blob_index_) {
+        status_ =
+            Status::NotSupported("BlobDB does not support merge operator.");
+        valid_ = false;
+        return false;
+      }
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      return true;
+    } else if (ikey.type == kTypeWideColumnEntity) {
+      if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned char>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+
+  // Make sure we leave iter_ in a good state. If it's valid and we don't care
+  // about prefixes, that's already good enough. Otherwise it needs to be
+  // seeked to the current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    if (!expect_total_order_inner_iter()) {
+      iter_.SeekForPrev(last_key);
+    } else {
+      iter_.Seek(last_key);
+      if (!iter_.Valid() && iter_.status().ok()) {
+        iter_.SeekToLast();
+      }
+    }
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::Merge(const Slice* val, const Slice& user_key) {
+  Status s = MergeHelper::TimedFullMerge(
+      merge_operator_, user_key, val, merge_context_.GetOperands(),
+      &saved_value_, logger_, statistics_, clock_, &pinned_value_,
+      /* update_num_ops_stats */ true);
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+
+  SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
+                                                   : saved_value_);
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) {
+  Status s = MergeHelper::TimedFullMergeWithEntity(
+      merge_operator_, user_key, entity, merge_context_.GetOperands(),
+      &saved_value_, logger_, statistics_, clock_,
+      /* update_num_ops_stats */ true);
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+
+  if (!SetValueAndColumnsFromEntity(saved_value_)) {
+    return false;
+  }
+
+  valid_ = true;
+  return true;
+}
+
+// Move backwards until the key smaller than saved_key_.
+// Changes valid_ only if return value is false.
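+// On success iter_ ends up on the last internal entry of the preceding user
+// key, or invalid if saved_key_ was already the smallest key.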
+bool DBIter::FindUserKeyBeforeSavedKey() { + assert(status_.ok()); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) { + return true; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey.sequence != kMaxSequenceNumber); + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts)) { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + } else { + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + if (num_skipped >= max_skip_) { + num_skipped = 0; + IterKey last_key; + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); + // It would be more efficient to use SeekForPrev() here, but some + // iterators may not support it. + iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + if (!iter_.Valid()) { + break; + } + } else { + ++num_skipped; + } + + iter_.Prev(); + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; +} + +bool DBIter::TooManyInternalKeysSkipped(bool increment) { + if ((max_skippable_internal_keys_ > 0) && + (num_internal_keys_skipped_ > max_skippable_internal_keys_)) { + valid_ = false; + status_ = Status::Incomplete("Too many internal keys skipped."); + return true; + } else if (increment) { + num_internal_keys_skipped_++; + } + return false; +} + +bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent) { + // Remember that comparator orders preceding timestamp as larger. + // TODO(yanqin): support timestamp in read_callback_. + bool visible_by_seq = (read_callback_ == nullptr) + ? sequence <= sequence_ + : read_callback_->IsVisible(sequence); + + bool visible_by_ts = + (timestamp_ub_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0); + + if (more_recent) { + *more_recent = !visible_by_seq; + } + return visible_by_seq && visible_by_ts; +} + +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + SequenceNumber seq = sequence_; + saved_key_.Clear(); + saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_); + + if (iterate_lower_bound_ != nullptr && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { + // Seek key is smaller than the lower bound. + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek, + timestamp_ub_); + } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey( + /*seq=*/0, kValueTypeForSeekForPrev, + timestamp_lb_ == nullptr ? 
&ts : timestamp_lb_); + } + + if (iterate_upper_bound_ != nullptr && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey( + kMaxSequenceNumber, kValueTypeForSeekForPrev, + timestamp_lb_ != nullptr ? timestamp_lb_ : &ts); + } + } +} + +void DBIter::Seek(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetBlobValue(); + ResetValueAndColumns(); + ResetInternalKeysSkippedCounter(); + + // Seek the inner iterator based on the target key. + { + PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); + iter_.Seek(saved_key_.GetInternalKey()); + + RecordTick(statistics_, NUMBER_DB_SEEK); + } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exhausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Next() call to + // check. + prefix_.SetUserKey(target_prefix); + } + } else { + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; + } + + // Updating stats and perf context counters. + if (statistics_ != nullptr) { + // Decrement since we don't want to count this key as skipped + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + } + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); +} + +void DBIter::SeekForPrev(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + // TODO: What do we do if this returns an error? 
+    Slice lower_bound, upper_bound;
+    if (iterate_lower_bound_ != nullptr) {
+      lower_bound = *iterate_lower_bound_;
+    } else {
+      lower_bound = Slice("");
+    }
+    if (iterate_upper_bound_ != nullptr) {
+      upper_bound = *iterate_upper_bound_;
+    } else {
+      upper_bound = Slice("");
+    }
+    db_impl_
+        ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound,
+                                   upper_bound)
+        .PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  status_ = Status::OK();
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+
+  // Seek the inner iterator based on the target key.
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    SetSavedKeyToSeekForPrevTarget(target);
+    iter_.SeekForPrev(saved_key_.GetInternalKey());
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+  }
+  if (!iter_.Valid()) {
+    valid_ = false;
+    return;
+  }
+  direction_ = kReverse;
+
+  // Now the inner iterator is placed to the target position. From there,
+  // we need to find out the first key that is visible to the user in the
+  // backward direction.
+  ClearSavedValue();
+  if (prefix_same_as_start_) {
+    // The case where the iterator needs to be invalidated if it has exhausted
+    // keys within the same prefix of the seek key.
+    assert(prefix_extractor_ != nullptr);
+    Slice target_prefix = prefix_extractor_->Transform(target);
+    PrevInternal(&target_prefix);
+    if (valid_) {
+      // Remember the prefix of the seek key for the future Prev() call to
+      // check.
+      prefix_.SetUserKey(target_prefix);
+    }
+  } else {
+    PrevInternal(nullptr);
+  }
+
+  // Report stats and perf context.
+  if (statistics_ != nullptr && valid_) {
+    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+  }
+}
+
+void DBIter::SeekToFirst() {
+  if (iterate_lower_bound_ != nullptr) {
+    Seek(*iterate_lower_bound_);
+    return;
+  }
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
+  status_.PermitUncheckedError();
+  direction_ = kForward;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToFirst();
+  }
+
+  RecordTick(statistics_, NUMBER_DB_SEEK);
+  if (iter_.Valid()) {
+    saved_key_.SetUserKey(
+        ExtractUserKey(iter_.key()),
+        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+    FindNextUserEntry(false /* not skipping saved_key */,
+                      nullptr /* no prefix check */);
+    if (statistics_ != nullptr) {
+      if (valid_) {
+        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+        RecordTick(statistics_, ITER_BYTES_READ,
+                   key().size() + value().size());
+        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+      }
+    }
+  } else {
+    valid_ = false;
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+void DBIter::SeekToLast() {
+  if (iterate_upper_bound_ != nullptr) {
+    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
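+    // E.g. with iterate_upper_bound = "m" and user keys {"a", "m"}, the
+    // SeekForPrev("m") below lands on "m" itself; the loop then steps back
+    // with PrevInternal() until the key is strictly below the bound,
+    // finally yielding "a".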
+    SeekForPrev(*iterate_upper_bound_);
+    const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr);
+    Slice k = Valid() ? key() : Slice();
+    if (is_ikey && Valid()) {
+      k.remove_suffix(kNumInternalBytes + timestamp_size_);
+    }
+    while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
+                               *iterate_upper_bound_, /*a_has_ts=*/false, k,
+                               /*b_has_ts=*/false)) {
+      ReleaseTempPinnedData();
+      ResetBlobValue();
+      ResetValueAndColumns();
+      PrevInternal(nullptr);
+
+      k = key();
+      if (is_ikey) {
+        k.remove_suffix(kNumInternalBytes + timestamp_size_);
+      }
+    }
+    return;
+  }
+
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
+  status_.PermitUncheckedError();
+  direction_ = kReverse;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToLast();
+  }
+  PrevInternal(nullptr);
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+    }
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+                        const ImmutableOptions& ioptions,
+                        const MutableCFOptions& mutable_cf_options,
+                        const Comparator* user_key_comparator,
+                        InternalIterator* internal_iter, const Version* version,
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        ReadCallback* read_callback, DBImpl* db_impl,
+                        ColumnFamilyData* cfd, bool expose_blob_index) {
+  DBIter* db_iter =
+      new DBIter(env, read_options, ioptions, mutable_cf_options,
+                 user_key_comparator, internal_iter, version, sequence, false,
+                 max_sequential_skip_in_iterations, read_callback, db_impl,
+                 cfd, expose_blob_index);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..e87c2b4c9
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,420 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
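Aside: a minimal sketch of wiring up the NewDBIterator() factory defined at
the end of db_iter.cc above (this is the internal API, not the public
DB::NewIterator(); env, read_options, ioptions, mutable_cf_options,
internal_iter and snapshot_seq are assumed to be supplied by the caller):

    std::unique_ptr<Iterator> it(NewDBIterator(
        env, read_options, ioptions, mutable_cf_options,
        ioptions.user_comparator, internal_iter, /*version=*/nullptr,
        /*sequence=*/snapshot_seq,
        mutable_cf_options.max_sequential_skip_in_iterations,
        /*read_callback=*/nullptr, /*db_impl=*/nullptr, /*cfd=*/nullptr,
        /*expose_blob_index=*/false));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // Yields only the newest visible version of each user key at
      // snapshot_seq, with tombstones suppressed and merges resolved.
    }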
+
+#pragma once
+#include <stdint.h>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/wide_columns.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Version;
+
+// This file declares the factory functions of DBIter, in its original form
+// or a wrapped form with class ArenaWrappedDBIter, which is defined here.
+// Class DBIter, which is declared and implemented inside db_iter.cc, is
+// an iterator that converts internal keys (yielded by an InternalIterator)
+// that were live at the specified sequence number into appropriate user
+// keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc, and exposes an Iterator.
+// For example, DBIter may wrap following InternalIterator:
+//    user key: AAA  value: v3  seqno: 100  type: Put
+//    user key: AAA  value: v2  seqno: 97   type: Put
+//    user key: AAA  value: v1  seqno: 95   type: Put
+//    user key: BBB  value: v1  seqno: 90   type: Put
+//    user key: BBC  value: N/A seqno: 98   type: Delete
+//    user key: BBC  value: v1  seqno: 95   type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+//    key: AAA  value: v3
+//    key: BBB  value: v1
+// If the snapshot passed in is 96, then it should expose:
+//    key: AAA  value: v1
+//    key: BBB  value: v1
+//    key: BBC  value: v1
+//
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward:
+  //   (1a) if current_entry_is_merged_ = false, the internal iterator is
+  //        positioned at the exact entry that yields this->key(),
+  //        this->value()
+  //   (1b) if current_entry_is_merged_ = true, the internal iterator is
+  //        positioned immediately after the last entry that contributed to
+  //        the current this->value(). That entry may or may not have key
+  //        equal to this->key().
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction : uint8_t { kForward, kReverse };
+
+  // LocalStatistics contain Statistics counters that will be aggregated per
+  // each iterator instance and then will be sent to the global statistics
+  // when the iterator is destroyed.
+  //
+  // The purpose of this approach is to avoid perf regression happening
+  // when multiple threads bump the atomic counters from a DBIter::Next().
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + skip_count_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; + }; + + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool expose_blob_index); + + // No copying allowed + DBIter(const DBIter&) = delete; + void operator=(const DBIter&) = delete; + + ~DBIter() override { + // Release pinned data if any + if (pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); + local_stats_.BumpGlobalStatistics(statistics_); + iter_.DeleteIter(arena_mode_); + } + void SetIter(InternalIterator* iter) { + assert(iter_.iter() == nullptr); + iter_.Set(iter); + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } + + bool Valid() const override { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (valid_) { + status_.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return valid_; + } + Slice key() const override { + assert(valid_); + if (timestamp_lb_) { + return saved_key_.GetInternalKey(); + } else { + const Slice ukey_and_ts = saved_key_.GetUserKey(); + return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_); + } + } + Slice value() const override { + assert(valid_); + + return value_; + } + + const WideColumns& columns() const override { + assert(valid_); + + return wide_columns_; + } + + Status status() const override { + if (status_.ok()) { + return iter_.status(); + } else { + assert(!valid_); + return status_; + } + } + Slice timestamp() const override { + assert(valid_); + assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } + const Slice ukey_and_ts = saved_key_.GetUserKey(); + assert(timestamp_size_ < ukey_and_ts.size()); + return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); + } + bool IsBlob() const { + assert(valid_); + return is_blob_; + } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + void Next() final override; + void Prev() final override; + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. 
+  void Seek(const Slice& target) final override;
+  void SeekForPrev(const Slice& target) final override;
+  void SeekToFirst() final override;
+  void SeekToLast() final override;
+  Env* env() const { return env_; }
+  void set_sequence(uint64_t s) {
+    sequence_ = s;
+    if (read_callback_) {
+      read_callback_->Refresh(s);
+    }
+  }
+  void set_valid(bool v) { valid_ = v; }
+
+ private:
+  // For all methods in this block:
+  //   PRE: iter_->Valid() && status_.ok()
+  // Return false if there was an error; in that case status() is non-ok and
+  // valid_ == false, and callers would usually stop what they were doing and
+  // return.
+  bool ReverseToForward();
+  bool ReverseToBackward();
+  // Set saved_key_ to the seek key to target, with the proper sequence number
+  // set. It might get adjusted if the seek key is smaller than the iterator
+  // lower bound. target does not have a timestamp.
+  void SetSavedKeyToSeekTarget(const Slice& target);
+  // Set saved_key_ to the seek key to target, with the proper sequence number
+  // set. It might get adjusted if the seek key is larger than the iterator
+  // upper bound. target does not have a timestamp.
+  void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+  bool FindValueForCurrentKey();
+  bool FindValueForCurrentKeyUsingSeek();
+  bool FindUserKeyBeforeSavedKey();
+  // If `skipping_saved_key` is true, the function will keep iterating until it
+  // finds a user key that is larger than `saved_key_`.
+  // If `prefix` is not null, the iterator needs to stop when all keys for the
+  // prefix are exhausted, and the iterator is set to invalid.
+  bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+  // Internal implementation of FindNextUserEntry().
+  bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+  bool ParseKey(ParsedInternalKey* key);
+  bool MergeValuesNewToOld();
+
+  // If prefix is not null, we need to set the iterator to invalid if no more
+  // entries can be found within the prefix.
+  void PrevInternal(const Slice* prefix);
+  bool TooManyInternalKeysSkipped(bool increment = true);
+  bool IsVisible(SequenceNumber sequence, const Slice& ts,
+                 bool* more_recent = nullptr);
+
+  // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+  // is called
+  void TempPinData() {
+    if (!pin_thru_lifetime_) {
+      pinned_iters_mgr_.StartPinning();
+    }
+  }
+
+  // Release blocks pinned by TempPinData()
+  void ReleaseTempPinnedData() {
+    if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+      pinned_iters_mgr_.ReleasePinnedData();
+    }
+  }
+
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  inline void ResetInternalKeysSkippedCounter() {
+    local_stats_.skip_count_ += num_internal_keys_skipped_;
+    if (valid_) {
+      local_stats_.skip_count_--;
+    }
+    num_internal_keys_skipped_ = 0;
+  }
+
+  bool expect_total_order_inner_iter() {
+    assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+    return expect_total_order_inner_iter_;
+  }
+
+  // If a lower bound on the timestamp is given by ReadOptions.iter_start_ts,
+  // we need to return multiple versions of the same key. In that case we
+  // cannot skip an entry merely because its user key matches: entries whose
+  // timestamps differ but fall within the timestamp range must all be
+  // returned.
+  inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
+    return timestamp_lb_ != nullptr
+               ?
+               user_comparator_.Compare(a, b)
+               : user_comparator_.CompareWithoutTimestamp(a, b);
+  }
+
+  // Retrieves the blob value for the specified user key using the given blob
+  // index when using the integrated BlobDB implementation.
+  bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
+
+  void ResetBlobValue() {
+    is_blob_ = false;
+    blob_value_.Reset();
+  }
+
+  void SetValueAndColumnsFromPlain(const Slice& slice) {
+    assert(value_.empty());
+    assert(wide_columns_.empty());
+
+    value_ = slice;
+    wide_columns_.emplace_back(kDefaultWideColumnName, slice);
+  }
+
+  bool SetValueAndColumnsFromEntity(Slice slice);
+
+  void ResetValueAndColumns() {
+    value_.clear();
+    wide_columns_.clear();
+  }
+
+  // If user-defined timestamp is enabled, `user_key` includes timestamp.
+  bool Merge(const Slice* val, const Slice& user_key);
+  bool MergeEntity(const Slice& entity, const Slice& user_key);
+
+  const SliceTransform* prefix_extractor_;
+  Env* const env_;
+  SystemClock* clock_;
+  Logger* logger_;
+  UserComparatorWrapper user_comparator_;
+  const MergeOperator* const merge_operator_;
+  IteratorWrapper iter_;
+  const Version* version_;
+  ReadCallback* read_callback_;
+  // Max visible sequence number. It is normally the snapshot seq unless we
+  // have uncommitted data in the db, as in WriteUnCommitted.
+  SequenceNumber sequence_;
+
+  IterKey saved_key_;
+  // Reusable internal key data structure. This is only used inside one
+  // function and should not be used across functions. Reusing this object
+  // avoids the overhead of constructing a new one on every call.
+  ParsedInternalKey ikey_;
+  std::string saved_value_;
+  Slice pinned_value_;
+  // for prefix seek mode to support prev()
+  PinnableSlice blob_value_;
+  // Value of the default column
+  Slice value_;
+  // All columns (i.e. name-value pairs)
+  WideColumns wide_columns_;
+  Statistics* statistics_;
+  uint64_t max_skip_;
+  uint64_t max_skippable_internal_keys_;
+  uint64_t num_internal_keys_skipped_;
+  const Slice* iterate_lower_bound_;
+  const Slice* iterate_upper_bound_;
+
+  // The prefix of the seek key. It is only used when prefix_same_as_start_
+  // is true and the prefix extractor is not null. In Next() or Prev(), current
+  // keys will be checked against this prefix, so that the iterator can be
+  // invalidated once the keys in this prefix have been exhausted. Set it using
+  // SetUserKey() and read it using GetUserKey().
+  IterKey prefix_;
+
+  Status status_;
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  // True if we know that the current entry's seqnum is 0.
+  // This information is used to infer that the next entry will be for another
+  // user key.
+  bool is_key_seqnum_zero_;
+  const bool prefix_same_as_start_;
+  // Means that we will pin all data blocks we read as long as the Iterator
+  // is not deleted; it will be true if ReadOptions::pin_data is true
+  const bool pin_thru_lifetime_;
+  // Expect the inner iterator to maintain a total order.
+  // prefix_extractor_ must be non-NULL if the value is false.
+  const bool expect_total_order_inner_iter_;
+  ReadTier read_tier_;
+  bool fill_cache_;
+  bool verify_checksums_;
+  // Whether the iterator is allowed to expose blob references. Set to true
+  // when the stacked BlobDB implementation is used, false otherwise.
+  bool expose_blob_index_;
+  bool is_blob_;
+  bool arena_mode_;
+  // List of operands for merge operator.
+  MergeContext merge_context_;
+  LocalStatistics local_stats_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+  ROCKSDB_FIELD_UNUSED
+#endif
+  DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+  ROCKSDB_FIELD_UNUSED
+#endif
+  ColumnFamilyData* cfd_;
+  const Slice* const timestamp_ub_;
+  const Slice* const timestamp_lb_;
+  const size_t timestamp_size_;
+  std::string saved_timestamp_;
+
+  // Used only if timestamp_lb_ is not nullptr.
+  std::string saved_ikey_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options,
+    const Comparator* user_key_comparator, InternalIterator* internal_iter,
+    const Version* version, const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..872f7e6bd
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,658 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+            "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+  std::string key;
+  ValueType type;  // kTypeValue, kTypeDeletion, kTypeMerge
+  uint64_t sequence;
+  std::string ikey;  // internal key, made from `key`, `sequence` and `type`
+  std::string value;
+  // If false, we'll pretend that this entry doesn't exist.
+  bool visible = true;
+
+  bool operator<(const Entry& e) const {
+    if (key != e.key) return key < e.key;
+    return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+  }
+};
+
+struct Data {
+  std::vector<Entry> entries;
+
+  // Indices in `entries` with `visible` = false.
+  std::vector<size_t> hidden;
+  // Keys of entries whose `visible` changed since the last seek of iterators.
+  std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+  Data* data;
+  Random64* rnd;
+  InternalKeyComparator cmp;
+
+  // Each operation will return an error with this probability...
+  double error_probability = 0;
+  // ... and add/remove entries with this probability.
+  double mutation_probability = 0;
+  // The probability of adding vs removing entries will be chosen so that the
+  // fraction of hidden entries stays somewhat close to this number.
+  double target_hidden_fraction = 0;
+  // If true, print all mutations to stdout for debugging.
+  bool trace = false;
+
+  int iter = -1;
+  Status status_;
+
+  StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+      : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+  bool Valid() const override {
+    if (iter >= 0 && iter < (int)data->entries.size()) {
+      assert(status_.ok());
+      return true;
+    }
+    return false;
+  }
+
+  Status status() const override { return status_; }
+
+  bool MaybeFail() {
+    if (rnd->Next() >=
+        static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+            error_probability) {
+      return false;
+    }
+    if (rnd->Next() % 2) {
+      status_ = Status::Incomplete("test");
+    } else {
+      status_ = Status::IOError("test");
+    }
+    if (trace) {
+      std::cout << "injecting " << status_.ToString() << std::endl;
+    }
+    iter = -1;
+    return true;
+  }
+
+  void MaybeMutate() {
+    if (rnd->Next() >=
+        static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+            mutation_probability) {
+      return;
+    }
+    do {
+      // If too many entries are hidden, hide less often; otherwise hide more
+      // often.
+      double hide_probability =
+          data->hidden.size() > data->entries.size() * target_hidden_fraction
+              ? 1. / 3
+              : 2. / 3;
+      if (data->hidden.empty()) {
+        hide_probability = 1;
+      }
+      bool do_hide = rnd->Next() <
+                     static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+                         hide_probability;
+      if (do_hide) {
+        // Hide a random entry.
+        size_t idx = rnd->Next() % data->entries.size();
+        Entry& e = data->entries[idx];
+        if (e.visible) {
+          if (trace) {
+            std::cout << "hiding idx " << idx << std::endl;
+          }
+          e.visible = false;
+          data->hidden.push_back(idx);
+          data->recently_touched_keys.insert(e.key);
+        } else {
+          // Already hidden. Let's go unhide something instead, just because
+          // it's easy and it doesn't really matter what we do.
+          do_hide = false;
+        }
+      }
+      if (!do_hide) {
+        // Unhide a random entry.
+        size_t hi = rnd->Next() % data->hidden.size();
+        size_t idx = data->hidden[hi];
+        if (trace) {
+          std::cout << "unhiding idx " << idx << std::endl;
+        }
+        Entry& e = data->entries[idx];
+        assert(!e.visible);
+        e.visible = true;
+        data->hidden[hi] = data->hidden.back();
+        data->hidden.pop_back();
+        data->recently_touched_keys.insert(e.key);
+      }
+    } while (rnd->Next() % 3 != 0);  // do 3 mutations on average
+  }
+
+  void SkipForward() {
+    while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+      ++iter;
+    }
+  }
+  void SkipBackward() {
+    while (iter >= 0 && !data->entries[iter].visible) {
+      --iter;
+    }
+  }
+
+  void SeekToFirst() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = 0;
+    SkipForward();
+  }
+  void SeekToLast() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = (int)data->entries.size() - 1;
+    SkipBackward();
+  }
+
+  void Seek(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+    iter = (int)(it - data->entries.begin());
+    SkipForward();
+  }
+  void SeekForPrev(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
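+    // Note: both Seek() and SeekForPrev() binary-search the sorted `entries`
+    // vector with std::partition_point. Seek() (above) positions `iter` at
+    // the first entry with ikey >= target; SeekForPrev() (below) finds the
+    // first entry with ikey > target and steps back once, i.e. the last
+    // entry with ikey <= target.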
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+    iter = (int)(it - data->entries.begin());
+    --iter;
+    SkipBackward();
+  }
+
+  void Next() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    ++iter;
+    SkipForward();
+  }
+  void Prev() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    --iter;
+    SkipBackward();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return data->entries[iter].ikey;
+  }
+  Slice value() const override {
+    assert(Valid());
+    return data->entries[iter].value;
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+  Data* data;
+  uint64_t sequence;  // ignore entries with a sequence number above this
+
+  bool valid = false;
+  std::string key;
+  std::string value;
+
+  ReferenceIterator(Data* _data, uint64_t _sequence)
+      : data(_data), sequence(_sequence) {}
+
+  bool Valid() const { return valid; }
+
+  // Finds the first entry with a key
+  // greater/less/greater-or-equal/less-or-equal than `key`, depending on the
+  // arguments: if `skip`, the inequality is strict; if `forward`, it's
+  // greater/greater-or-equal, otherwise less/less-or-equal.
+  // Sets `key` to the result.
+  // If no such key exists, returns false. Doesn't check `visible`.
+  bool FindNextKey(bool skip, bool forward) {
+    valid = false;
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (forward != skip) {
+                                       return e.key < key;
+                                     } else {
+                                       return e.key <= key;
+                                     }
+                                   });
+    if (forward) {
+      if (it != data->entries.end()) {
+        key = it->key;
+        return true;
+      }
+    } else {
+      if (it != data->entries.begin()) {
+        --it;
+        key = it->key;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool FindValueForCurrentKey() {
+    if (data->recently_touched_keys.count(key)) {
+      return false;
+    }
+
+    // Find the first entry for the key. The caller promises that it exists.
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (e.key != key) {
+                                       return e.key < key;
+                                     }
+                                     return e.sequence > sequence;
+                                   });
+
+    // Find the first visible entry.
+    for (;; ++it) {
+      if (it == data->entries.end()) {
+        return false;
+      }
+      Entry& e = *it;
+      if (e.key != key) {
+        return false;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        return false;
+      }
+      if (e.type == kTypeValue) {
+        value = e.value;
+        valid = true;
+        return true;
+      }
+      assert(e.type == kTypeMerge);
+      break;
+    }
+
+    // Collect merge operands.
+    std::vector<Slice> operands;
+    for (; it != data->entries.end(); ++it) {
+      Entry& e = *it;
+      if (e.key != key) {
+        break;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        break;
+      }
+      operands.push_back(e.value);
+      if (e.type == kTypeValue) {
+        break;
+      }
+    }
+
+    // Do a merge.
+    value = operands.back().ToString();
+    for (int i = (int)operands.size() - 2; i >= 0; --i) {
+      value.append(",");
+      value.append(operands[i].data(), operands[i].size());
+    }
+
+    valid = true;
+    return true;
+  }
+
+  // Start at `key` and move until we encounter a valid value.
+  // `forward` defines the direction of movement.
+  // If `skip` is true, we're looking for a key not equal to `key`.
+  void DoTheThing(bool skip, bool forward) {
+    while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+      skip = true;
+    }
+  }
+
+  void Seek(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, true);
+  }
+  void SeekForPrev(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, false);
+  }
+  void SeekToFirst() { Seek(""); }
+  void SeekToLast() {
+    key = data->entries.back().key;
+    DoTheThing(false, false);
+  }
+  void Next() {
+    assert(Valid());
+    DoTheThing(true, true);
+  }
+  void Prev() {
+    assert(Valid());
+    DoTheThing(true, false);
+  }
+};
+
+}  // anonymous namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check the results.
+// TODO: can be improved for more coverage:
+//  * Override IsKeyPinned() and IsValuePinned() to actually use
+//    PinnedIteratorsManager and check that there's no use-after-free.
+//  * Try different combinations of prefix_extractor, total_order_seek,
+//    prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+  // We use a deterministic RNG, and everything happens in a single thread.
+  Random64 rnd(826909345792864532ll);
+
+  auto gen_key = [&](int max_key) {
+    assert(max_key > 0);
+    int len = 0;
+    int a = max_key;
+    while (a) {
+      a /= 10;
+      ++len;
+    }
+    std::string s = std::to_string(rnd.Next() % static_cast<uint64_t>(max_key));
+    s.insert(0, len - (int)s.size(), '0');
+    return s;
+  };
+
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ReadOptions ropt;
+
+  size_t num_matching = 0;
+  size_t num_at_end = 0;
+  size_t num_not_ok = 0;
+  size_t num_recently_removed = 0;
+
+  // Number of iterations for each combination of parameters
+  // (there are ~250 of those).
+  // Tweak this to change the test run time.
+  // As of the time of writing, the test takes ~4 seconds for a value of 5000.
+  const int num_iterations = 5000;
+  // Enable this to print all the operations for debugging.
+  bool trace = FLAGS_verbose;
+
+  for (int num_entries : {5, 10, 100}) {
+    for (double key_space : {0.1, 1.0, 3.0}) {
+      for (ValueType prevalent_entry_type :
+           {kTypeValue, kTypeDeletion, kTypeMerge}) {
+        for (double error_probability : {0.01, 0.1}) {
+          for (double mutation_probability : {0.01, 0.5}) {
+            for (double target_hidden_fraction : {0.1, 0.5}) {
+              std::string trace_str =
+                  "entries: " + std::to_string(num_entries) +
+                  ", key_space: " + std::to_string(key_space) +
+                  ", error_probability: " + std::to_string(error_probability) +
+                  ", mutation_probability: " +
+                  std::to_string(mutation_probability) +
+                  ", target_hidden_fraction: " +
+                  std::to_string(target_hidden_fraction);
+              SCOPED_TRACE(trace_str);
+              if (trace) {
+                std::cout << trace_str << std::endl;
+              }
+
+              // Generate data.
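+              // Note: max_key is roughly num_entries * key_space, so a
+              // key_space of 0.1 forces heavy user-key collisions (many
+              // versions per key) while 3.0 makes user keys mostly unique.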
+              Data data;
+              int max_key = (int)(num_entries * key_space) + 1;
+              for (int i = 0; i < num_entries; ++i) {
+                Entry e;
+                e.key = gen_key(max_key);
+                if (rnd.Next() % 10 != 0) {
+                  e.type = prevalent_entry_type;
+                } else {
+                  const ValueType types[] = {kTypeValue, kTypeDeletion,
+                                             kTypeMerge};
+                  e.type =
+                      types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+                }
+                e.sequence = i;
+                e.value = "v" + std::to_string(i);
+                ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+                AppendInternalKey(&e.ikey, internal_key);
+
+                data.entries.push_back(e);
+              }
+              std::sort(data.entries.begin(), data.entries.end());
+              if (trace) {
+                std::cout << "entries:";
+                for (size_t i = 0; i < data.entries.size(); ++i) {
+                  Entry& e = data.entries[i];
+                  std::cout << "\n  idx " << i << ": \"" << e.key << "\": \""
+                            << e.value << "\" seq: " << e.sequence << " type: "
+                            << (e.type == kTypeValue      ? "val"
+                                : e.type == kTypeDeletion ? "del"
+                                                          : "merge");
+                }
+                std::cout << std::endl;
+              }
+
+              std::unique_ptr<Iterator> db_iter;
+              std::unique_ptr<ReferenceIterator> ref_iter;
+              for (int iteration = 0; iteration < num_iterations; ++iteration) {
+                SCOPED_TRACE(iteration);
+                // Create a new iterator every ~30 operations.
+                if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+                  uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+                  ref_iter.reset(new ReferenceIterator(&data, sequence));
+                  if (trace) {
+                    std::cout << "new iterator, seq: " << sequence << std::endl;
+                  }
+
+                  auto internal_iter =
+                      new StressTestIterator(&data, &rnd, BytewiseComparator());
+                  internal_iter->error_probability = error_probability;
+                  internal_iter->mutation_probability = mutation_probability;
+                  internal_iter->target_hidden_fraction =
+                      target_hidden_fraction;
+                  internal_iter->trace = trace;
+                  db_iter.reset(NewDBIterator(
+                      env_, ropt, ImmutableOptions(options),
+                      MutableCFOptions(options), BytewiseComparator(),
+                      internal_iter, nullptr /* version */, sequence,
+                      options.max_sequential_skip_in_iterations,
+                      nullptr /* read_callback */));
+                }
+
+                // Do a random operation. It's important to do it on ref_iter
+                // after db_iter, to make sure ref_iter sees the correct
+                // recently_touched_keys.
+                std::string old_key;
+                bool forward = rnd.Next() % 2 > 0;
+                // Do Next()/Prev() ~90% of the time.
+                bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+                if (trace) {
+                  std::cout << iteration << ": ";
+                }
+
+                if (!seek) {
+                  assert(db_iter->Valid());
+                  old_key = ref_iter->key;
+                  if (trace) {
+                    std::cout << (forward ? "Next" : "Prev") << std::endl;
+                  }
+
+                  if (forward) {
+                    db_iter->Next();
+                    ref_iter->Next();
+                  } else {
+                    db_iter->Prev();
+                    ref_iter->Prev();
+                  }
+                } else {
+                  data.recently_touched_keys.clear();
+                  // Do SeekToFirst less often than Seek.
+                  if (rnd.Next() % 4 == 0) {
+                    if (trace) {
+                      std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+                                << std::endl;
+                    }
+
+                    if (forward) {
+                      old_key = "";
+                      db_iter->SeekToFirst();
+                      ref_iter->SeekToFirst();
+                    } else {
+                      old_key = data.entries.back().key;
+                      db_iter->SeekToLast();
+                      ref_iter->SeekToLast();
+                    }
+                  } else {
+                    old_key = gen_key(max_key);
+                    if (trace) {
+                      std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+                                << old_key << '"' << std::endl;
+                    }
+                    if (forward) {
+                      db_iter->Seek(old_key);
+                      ref_iter->Seek(old_key);
+                    } else {
+                      db_iter->SeekForPrev(old_key);
+                      ref_iter->SeekForPrev(old_key);
+                    }
+                  }
+                }
+
+                // Check the result.
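+                // Four outcomes are distinguished below: the iterator landed
+                // on a key mutated during the operation (only the movement
+                // direction can be validated), a normal match (must agree
+                // with ref_iter exactly), both iterators exhausted, or an
+                // injected error surfaced through status().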
+ if (db_iter->Valid()) { + ASSERT_TRUE(db_iter->status().ok()); + if (data.recently_touched_keys.count( + db_iter->key().ToString())) { + // Ended on a key that may have been mutated during the + // operation. Reference iterator skips such keys, so we + // can't check the exact result. + + // Check that the key moved in the right direction. + if (forward) { + if (seek) + ASSERT_GE(db_iter->key().ToString(), old_key); + else + ASSERT_GT(db_iter->key().ToString(), old_key); + } else { + if (seek) + ASSERT_LE(db_iter->key().ToString(), old_key); + else + ASSERT_LT(db_iter->key().ToString(), old_key); + } + + if (ref_iter->Valid()) { + // Check that DBIter didn't miss any non-mutated key. + if (forward) { + ASSERT_LT(db_iter->key().ToString(), ref_iter->key); + } else { + ASSERT_GT(db_iter->key().ToString(), ref_iter->key); + } + } + // Tell the next iteration of the loop to reseek the + // iterators. + ref_iter->valid = false; + + ++num_recently_removed; + } else { + ASSERT_TRUE(ref_iter->Valid()); + ASSERT_EQ(ref_iter->key, db_iter->key().ToString()); + ASSERT_EQ(ref_iter->value, db_iter->value()); + ++num_matching; + } + } else if (db_iter->status().ok()) { + ASSERT_FALSE(ref_iter->Valid()); + ++num_at_end; + } else { + // Non-ok status. Nothing to check here. + // Tell the next iteration of the loop to reseek the + // iterators. + ref_iter->valid = false; + ++num_not_ok; + } + } + } + } + } + } + } + } + + // Check that all cases were hit many times. + EXPECT_GT(num_matching, 10000); + EXPECT_GT(num_at_end, 10000); + EXPECT_GT(num_not_ok, 10000); + EXPECT_GT(num_recently_removed, 10000); + + std::cout << "stats:\n exact matches: " << num_matching + << "\n end reached: " << num_at_end + << "\n non-ok status: " << num_not_ok + << "\n mutated on the fly: " << num_recently_removed << std::endl; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc new file mode 100644 index 000000000..65290bfad --- /dev/null +++ b/src/rocksdb/db/db_iter_test.cc @@ -0,0 +1,3195 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#include "db/db_iter.h"
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+                                   Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+  explicit TestIterator(const Comparator* comparator)
+      : initialized_(false),
+        valid_(false),
+        sequence_number_(0),
+        iter_(0),
+        cmp(comparator) {
+    data_.reserve(16);
+  }
+
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
+  }
+
+  void AddDeletion(std::string argkey) {
+    Add(argkey, kTypeDeletion, std::string());
+  }
+
+  void AddSingleDeletion(std::string argkey) {
+    Add(argkey, kTypeSingleDeletion, std::string());
+  }
+
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue) {
+    Add(argkey, type, argvalue, sequence_number_++);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue,
+           size_t seq_num, bool update_iter = false) {
+    valid_ = true;
+    ParsedInternalKey internal_key(argkey, seq_num, type);
+    data_.push_back(
+        std::pair<std::string, std::string>(std::string(), argvalue));
+    AppendInternalKey(&data_.back().first, internal_key);
+    if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+      // insert a key smaller than the current key
+      Finish();
+      // data_[iter_] is no longer the current element of the iterator.
+      // Increment iter_ to reposition it at the right element.
+      iter_++;
+    }
+  }
+
+  // should be called before any operation on the iterator
+  void Finish() {
+    initialized_ = true;
+    std::sort(data_.begin(), data_.end(),
+              [this](std::pair<std::string, std::string> a,
+                     std::pair<std::string, std::string> b) {
+                return (cmp.Compare(a.first, b.first) < 0);
+              });
+  }
+
+  // Removes the key from the set of keys over which this iterator iterates.
+  // Not to be confused with AddDeletion().
+  // If the iterator is currently positioned on this key, the deletion will
+  // apply the next time the iterator moves.
+  // Used for simulating ForwardIterator updating to a new version that doesn't
+  // have some of the keys (e.g. after compaction with a filter).
+  void Vanish(std::string _key) {
+    if (valid_ && data_[iter_].first == _key) {
+      delete_current_ = true;
+      return;
+    }
+    for (auto it = data_.begin(); it != data_.end(); ++it) {
+      ParsedInternalKey ikey;
+      Status pik_status =
+          ParseInternalKey(it->first, &ikey, true /* log_err_key */);
+      pik_status.PermitUncheckedError();
+      assert(pik_status.ok());
+      if (!pik_status.ok() || ikey.user_key != _key) {
+        continue;
+      }
+      if (valid_ && data_.begin() + iter_ > it) {
+        --iter_;
+      }
+      data_.erase(it);
+      return;
+    }
+    assert(false);
+  }
+
+  // Number of operations done on this iterator since construction.
+  size_t steps() const { return steps_; }
+
+  bool Valid() const override {
+    assert(initialized_);
+    return valid_;
+  }
+
+  void SeekToFirst() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = 0;
+  }
+
+  void SeekToLast() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = data_.size() - 1;
+  }
+
+  void Seek(const Slice& target) override {
+    assert(initialized_);
+    SeekToFirst();
+    ++steps_;
+    if (!valid_) {
+      return;
+    }
+    while (iter_ < data_.size() &&
+           (cmp.Compare(data_[iter_].first, target) < 0)) {
+      ++iter_;
+    }
+
+    if (iter_ == data_.size()) {
+      valid_ = false;
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    assert(initialized_);
+    DeleteCurrentIfNeeded();
+    SeekForPrevImpl(target, &cmp);
+  }
+
+  void Next() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    if (delete_current_) {
+      DeleteCurrentIfNeeded();
+    } else {
+      ++iter_;
+    }
+    valid_ = iter_ < data_.size();
+  }
+
+  void Prev() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    if (iter_ == 0) {
+      valid_ = false;
+    } else {
+      --iter_;
+    }
+  }
+
+  Slice key() const override {
+    assert(initialized_);
+    return data_[iter_].first;
+  }
+
+  Slice value() const override {
+    assert(initialized_);
+    return data_[iter_].second;
+  }
+
+  Status status() const override {
+    assert(initialized_);
+    return Status::OK();
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+
+ private:
+  bool initialized_;
+  bool valid_;
+  size_t sequence_number_;
+  size_t iter_;
+  size_t steps_ = 0;
+
+  InternalKeyComparator cmp;
+  std::vector<std::pair<std::string, std::string>> data_;
+  bool delete_current_ = false;
+
+  void DeleteCurrentIfNeeded() {
+    if (!delete_current_) {
+      return;
+    }
+    data_.erase(data_.begin() + iter_);
+    delete_current_ = false;
+  }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test to check SeekToLast() with iterate_upper_bound not set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+
+  // Test to check SeekToLast() with iterate_upper_bound set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->AddPut("f", "val_f");
+    internal_iter->Finish();
+
+    Slice prefix("d");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set to a key that
+  // has not been Put yet
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    Slice prefix("z");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set to the
+  // first key
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+  // Test case to check SeekToLast with iterate_upper_bound set
+  // (same
+  // key put many times - SeekToLast should start with the
+  // maximum sequence id of the upper bound)
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_key_skipped_count), 1);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set
+  // (checking the value of a key which has sequence ids both greater than
+  // and less than the iterator's sequence id)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a1");
+    internal_iter->AddPut("a", "val_a2");
+    internal_iter->AddPut("b", "val_b1");
+    internal_iter->AddPut("c", "val_c1");
+    internal_iter->AddPut("c", "val_c2");
+    internal_iter->AddPut("c", "val_c3");
+    internal_iter->AddPut("b", "val_b2");
+    internal_iter->AddPut("d", "val_d1");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+  }
+
+  // Test to check SeekToLast() with iterate_upper_bound set to a
+  // key that is deleted
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set
+  // (deletion cases)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_,
+        ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set
+  // (deletion cases - lots of internal keys after the upper bound
+  // are deleted)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("e");
+    internal_iter->AddDeletion("f");
+    internal_iter->AddDeletion("g");
+    internal_iter->AddDeletion("h");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        0);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
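+    // With read sequence 2, only internal entries with seqno <= 2 are
+    // visible: the first "a" (seq 0), the first "b" (seq 1) and the second
+    // "a" (seq 2); all later versions are ignored, so SeekToLast() lands on
+    // "b"/"val_b".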
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+  ReadOptions ro;
+  Options options;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (size_t i = 0; i < 200; ++i) {
+    internal_iter->AddPut("a", "a");
+    internal_iter->AddPut("b", "b");
+    internal_iter->AddPut("c", "c");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "c");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
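+  // Each backward step has to get past ~200 newer, invisible versions of the
+  // preceding key; once more than max_sequential_skip_in_iterations internal
+  // keys (8 by default) are skipped, DBIter falls back to a targeted reseek,
+  // which is what NUMBER_OF_RESEEKS_IN_ITERATION counts.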
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "a");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(!db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+
+    {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t i = 0; i < 200; ++i) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 202 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "200");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 200; ++i) {
+      internal_iter->AddDeletion("c");
+    }
+    internal_iter->AddPut("c", "200");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 200 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("d", std::to_string(k));
+      }
+
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "d");
+      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter =
+          new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "b");
+      internal_iter->AddMerge("a", "a");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddMerge("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      std::string merge_result = "0";
+      for (size_t j = 1; j <= i; ++j) {
+        merge_result += "," + std::to_string(j);
+      }
+      ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "b");
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "a");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
+  // Basic test case ... Make sure explicitly passing the default value works.
+  // Skipping internal keys is disabled by default (the value is 0).
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 0;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  // Test to make sure that the request will *not* fail as incomplete if
+  // num_internal_keys_skipped is *equal* to the max_skippable_internal_keys
+  // threshold. (It will fail as incomplete only when the threshold is
+  // exceeded.)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+  }
+
+  // Fail the request as incomplete when num_internal_keys_skipped >
+  // max_skippable_internal_keys
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
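+  // In the forward scan below, the two tombstones for "b" stay within the
+  // limit, so "c" is returned and the skip counter resets; the three
+  // tombstones for "d" then push the next call over the limit.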
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();  // num_internal_keys_skipped counter resets here.
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
+  // Reverse direction
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();  // num_internal_keys_skipped counter resets here.
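+    // The three tombstones for "b" now exceed max_skippable_internal_keys
+    // (2), so the reverse scan stops with Status::Incomplete().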
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that skipping separate keys is handled
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test if alternating puts and deletes of the same key are handled
+  // correctly.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test for large number of skippable internal keys with *default*
+  // max_sequential_skip_in_iterations.
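+  // The branch below restates how the two limits interact: once
+  // max_skippable_internal_keys exceeds max_sequential_skip_in_iterations + 1,
+  // the iterator reseeks past the duplicate internal keys before the skip
+  // counter can reach its limit, so the read succeeds instead of returning
+  // Status::Incomplete().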
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "c");
+        ASSERT_EQ(db_iter->value().ToString(), "val_c");
+      }
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "a");
+        ASSERT_EQ(db_iter->value().ToString(), "val_a");
+      }
+    }
+  }
+
+  // Test for large number of skippable internal keys with a *non-default*
+  // max_sequential_skip_in_iterations.
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      options.max_sequential_skip_in_iterations = 1000;
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      1 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      0 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      4 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0,1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 1 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 3 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    // put, singledelete, merge
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddSingleDeletion("a");
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 1 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 3 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 9 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 13 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 14 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator8) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("a");
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessarily
+// return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("b", "merge_3");
+    internal_iter->AddMerge("b", "merge_4");
+    internal_iter->AddMerge("d", "merge_5");
+    internal_iter->AddMerge("d", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, nullptr /* version */,
+        10 /* sequence */, options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+    db_iter->SeekForPrev("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+    db_iter->SeekForPrev("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+  }
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessarily
+// return the biggest element smaller than the seek key.
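+// DBIterator10 repeats the same direction-change pattern with plain Put
+// entries and no merge operator.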
+TEST_F(DBIteratorTest, DBIterator10) {
+  ReadOptions ro;
+  Options options;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddPut("d", "4");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->Seek("c");
+  ASSERT_TRUE(db_iter->Valid());
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+
+  db_iter->SeekForPrev("c");
+  ASSERT_TRUE(db_iter->Valid());
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "d");
+  ASSERT_EQ(db_iter->value().ToString(), "4");
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      1 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Prev();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator13) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key;
+  key.resize(9);
+  key.assign(9, static_cast<char>(0));
+  key[0] = 'b';
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut(key, "0");
+  internal_iter->AddPut(key, "1");
+  internal_iter->AddPut(key, "2");
+  internal_iter->AddPut(key, "3");
+  internal_iter->AddPut(key, "4");
+  internal_iter->AddPut(key, "5");
+  internal_iter->AddPut(key, "6");
+  internal_iter->AddPut(key, "7");
+  internal_iter->AddPut(key, "8");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
+      nullptr /* read_callback */));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), key);
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
+TEST_F(DBIteratorTest, DBIterator14) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key("b");
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddPut("b", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("b", "3");
+  internal_iter->AddPut("a", "4");
+  internal_iter->AddPut("a", "5");
+  internal_iter->AddPut("a", "6");
+  internal_iter->AddPut("c", "7");
+  internal_iter->AddPut("c", "8");
+  internal_iter->AddPut("c", "9");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
+      nullptr /* read_callback */));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->SeekToFirst();
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+  DBIterWithMergeIterTest()
+      : env_(Env::Default()), icomp_(BytewiseComparator()) {
+    options_.merge_operator = nullptr;
+
+    internal_iter1_ = new TestIterator(BytewiseComparator());
+    internal_iter1_->Add("a", kTypeValue, "1", 3u);
+    internal_iter1_->Add("f", kTypeValue, "2", 5u);
+    internal_iter1_->Add("g", kTypeValue, "3", 7u);
+    internal_iter1_->Finish();
+
+    internal_iter2_ = new TestIterator(BytewiseComparator());
+    internal_iter2_->Add("a", kTypeValue, "4", 6u);
+    internal_iter2_->Add("b", kTypeValue, "5", 1u);
+    internal_iter2_->Add("c", kTypeValue, "6", 2u);
+    internal_iter2_->Add("d", kTypeValue, "7", 3u);
+    internal_iter2_->Finish();
+
+    std::vector<InternalIterator*> child_iters;
+    child_iters.push_back(internal_iter1_);
+    child_iters.push_back(internal_iter2_);
+    InternalKeyComparator icomp(BytewiseComparator());
+    InternalIterator* merge_iter =
+        NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+    db_iter_.reset(NewDBIterator(
+        env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
+        BytewiseComparator(), merge_iter, nullptr /* version */,
+        8 /* read data earlier than seqId 8 */,
+        3 /* max iterators before reseek */, nullptr /* read_callback */));
+  }
+
+  Env* env_;
+  ReadOptions ro_;
+  Options options_;
+  TestIterator* internal_iter1_;
+  TestIterator* internal_iter2_;
+  InternalKeyComparator icomp_;
+  Iterator* merge_iter_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+  db_iter_->SeekToFirst();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Next();
+  ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+  // Test Prev() when one child iterator is at its end.
+  db_iter_->SeekForPrev("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() realized the mem table iterator is at its end
+  // and before a SeekToLast() is called.
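+  // The "MergeIterator::Prev:BeforePrev" sync point used below fires inside
+  // MergeIterator::Prev(), which lets the callback mutate internal_iter2_ at
+  // exactly that moment to simulate the race.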
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev",
+      [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test callback inserts entries to update a key at the end of the
+  // mem table after MergeIterator::Prev() realized the mem table iterator
+  // is at its end and before a SeekToLast() is called.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+        internal_iter2_->Add("z", kTypeValue, "7", 12u);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added and max_skipped is triggered.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test callback inserts entries to update a key at the end of the
+  // mem table after MergeIterator::Prev() realized the mem table iterator
+  // is at its end and before a SeekToLast() is called.
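+  // Six versions of "z" are appended below, enough to exceed the fixture's
+  // reseek threshold (3 /* max iterators before reseek */), which is what
+  // triggers the max_skipped path mentioned above.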
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+        internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts updates of a key before "z" into the
+  // memtable after MergeIterator::Prev() calls the memtable iterator's
+  // Seek() and before calling Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
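+  // With "z" added above, internal_iter2_ now extends past "g": the Seek()
+  // below positions that child on "z" while the merge iterator still
+  // returns "g" from internal_iter1_.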
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts updates of a key before "z" into the
+  // memtable after MergeIterator::Prev() calls the memtable iterator's
+  // Seek() and before calling Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts a single update of a key before "z" into the
+  // memtable after MergeIterator::Prev() calls the memtable iterator's
+  // Seek() and before calling Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+  internal_iter1_->Add("u", kTypeValue, "10", 4u);
+  internal_iter1_->Add("v", kTypeValue, "11", 4u);
+  internal_iter1_->Add("w", kTypeValue, "12", 4u);
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts updates of a key before "z" into the
+  // memtable after MergeIterator::Prev() calls the memtable iterator's
+  // Seek() and before calling Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+  // internal_iter1_: a, f, g
+  // internal_iter2_: a, b, c, d, adding (z)
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
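+  // Unlike the earlier races, the callback here adds two distinct user keys
+  // ("x" and "y"); their sequence numbers (16 and 17) are above this
+  // iterator's snapshot of 8, so backward iteration has to skip them.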
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" into the memtable after
+  // MergeIterator::Prev() calls the memtable iterator's Seek() and before
+  // calling Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+  ReadOptions ro;
+  Options options;
+  options.prefix_extractor.reset(NewNoopTransform());
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("b");
+  internal_iter->AddDeletion("c");
+  internal_iter->AddDeletion("d");
+  internal_iter->AddDeletion("e");
+  internal_iter->AddDeletion("f");
+  internal_iter->AddDeletion("g");
+  internal_iter->Finish();
+
+  ro.prefix_same_as_start = true;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  int skipped_keys = 0;
+
+  get_perf_context()->Reset();
+  db_iter->SeekForPrev("z");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+
+  get_perf_context()->Reset();
+  db_iter->Seek("a");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+}
+
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+  const int kNumKeys = 3;
+  for (int i = 0; i < kNumKeys + 2; ++i) {
+    // + 2 for two special cases: lower bound before and lower bound after the
+    // internal iterator's keys
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (int j = 1; j <= kNumKeys; ++j) {
+      internal_iter->AddPut(std::to_string(j), "val");
+    }
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    auto lower_bound_str = std::to_string(i);
+    Slice lower_bound(lower_bound_str);
+    ro.iterate_lower_bound = &lower_bound;
+    Options options;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, nullptr /* version */,
+        10 /* sequence */, options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    if (i == kNumKeys + 1) {
+      // lower bound was beyond the last key
+      ASSERT_FALSE(db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    } else {
+      ASSERT_TRUE(db_iter->Valid());
+      int expected;
+      if (i == 0) {
+        // lower bound was before the first key
+        expected = 1;
+      } else {
+        // lower bound was at the ith key
+        expected = i;
+      }
+      ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, PrevLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->SeekToLast();
+  for (int i = kNumKeys; i >= kLowerBound; --i) {
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+    db_iter->Prev();
+  }
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
+  db_iter->Seek(before_lower_bound);
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+  Options options;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "A");
+  internal_iter->AddPut("b", "B");
+  for (int i = 0; i < 100; ++i) {
+    internal_iter->AddPut("c" + std::to_string(i), "");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->SeekForPrev("a");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("a", db_iter->key().ToString());
+
+  internal_iter->Vanish("a");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("b", db_iter->key().ToString());
+
+  // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+  // iterator all the way to the end. But this doesn't really matter at the
+  // time of writing because the only iterator that can see disappearing keys
+  // is ForwardIterator, which doesn't support SeekForPrev().
+  EXPECT_LT(internal_iter->steps(), 20);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..aaf1408b4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,3265 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+#include "util/random.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback that says every key is committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+  DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+  bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+  void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+//   bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+                       public testing::WithParamInterface<bool> {
+ public:
+  DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {}
+
+  Iterator* NewIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family = nullptr) {
+    if (column_family == nullptr) {
+      column_family = db_->DefaultColumnFamily();
+    }
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    SequenceNumber seq = read_options.snapshot != nullptr
+                             ? read_options.snapshot->GetSequenceNumber()
+                             : db_->GetLatestSequenceNumber();
+    bool use_read_callback = GetParam();
+    DummyReadCallback* read_callback = nullptr;
+    if (use_read_callback) {
+      read_callback = new DummyReadCallback();
+      read_callback->SetSnapshot(seq);
+      InstrumentedMutexLock lock(&mutex_);
+      read_callbacks_.push_back(
+          std::unique_ptr<DummyReadCallback>(read_callback));
+    }
+    return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
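+  // "rocksdb.iterator.is-key-pinned" reports whether key() returns a slice
+  // that remains valid until the iterator is deleted, which requires
+  // ReadOptions::pin_data; with pin_data = false below it should be "0".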
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "1", "2"));
+  ASSERT_OK(Delete(1, "2"));
+  ReadOptions ropt;
+  ropt.pin_data = false;
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+    iter->SeekToFirst();
+    std::string prop_value;
+    ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("0", prop_value);
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    iter->Next();
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("Iterator is not valid.", prop_value);
+
+    // Get internal key at which the iteration stopped (tombstone in this
+    // case).
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("2", prop_value);
+  }
+  Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+
+  auto* iter = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_TRUE(iter->status().IsNotSupported());
+  delete iter;
+
+  std::vector<Iterator*> iters;
+  ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+  Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    anon::OptionsOverride options_override;
+    options_override.full_block_cache = true;
+    Options options = CurrentOptions(options_override);
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable nor in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs, nor does it do any IO to storage.
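+    // With kBlockCacheTier, an iterator that would need storage IO becomes
+    // invalid and reports Status::Incomplete() instead of reading from disk;
+    // the scan below relies on that to prove no IO happens.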
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get(1, "a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads)); +} + +TEST_P(DBIteratorTest, IterSeekBeforePrev) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("2", "j")); + auto iter = NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter->Prev(); + iter->Seek(Slice("a")); + iter->Prev(); + delete iter; +} + +TEST_P(DBIteratorTest, IterReseekNewUpperBound) { + Random rnd(301); + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + table_options.block_size_deviation = 50; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + Reopen(options); + + ASSERT_OK(Put("a", rnd.RandomString(400))); + ASSERT_OK(Put("aabb", rnd.RandomString(400))); + ASSERT_OK(Put("aaef", rnd.RandomString(400))); + ASSERT_OK(Put("b", rnd.RandomString(400))); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ReadOptions opts; + Slice ub = Slice("aa"); + opts.iterate_upper_bound = &ub; + auto iter = NewIterator(opts); + iter->Seek(Slice("a")); + ub = Slice("b"); + iter->Seek(Slice("aabc")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aaef"); + delete iter; +} + +TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("2", "j")); + auto iter = NewIterator(ReadOptions()); + iter->SeekForPrev(Slice("0")); + iter->Next(); + iter->SeekForPrev(Slice("1")); + iter->Next(); + delete iter; +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // anonymous namespace + +TEST_P(DBIteratorTest, IterLongKeys) { + ASSERT_OK(Put(MakeLongKey(20, 0), "0")); + ASSERT_OK(Put(MakeLongKey(32, 2), "2")); + ASSERT_OK(Put("a", "b")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put(MakeLongKey(50, 1), "1")); + ASSERT_OK(Put(MakeLongKey(127, 3), "3")); + ASSERT_OK(Put(MakeLongKey(64, 4), "4")); + auto iter = NewIterator(ReadOptions()); + + // Create a key 
that needs to be skipped for Seq too new + iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + + iter->SeekForPrev(MakeLongKey(127, 3)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + delete iter; + + iter = NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + +TEST_P(DBIteratorTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->SeekForPrev(Slice("b")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + + delete iter; +} + +TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + iter->SeekForPrev(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + delete iter; +} + +TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + EXPECT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("e", "f")); + auto iter = NewIterator(ReadOptions()); + auto iter2 = NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter2->SeekForPrev(Slice("d")); + ASSERT_EQ(IterStatus(iter), "c->d"); + ASSERT_EQ(IterStatus(iter2), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + iter2->Prev(); + ASSERT_EQ(IterStatus(iter2), "a->b"); + iter2->Prev(); + delete iter; + delete iter2; +} + +TEST_P(DBIteratorTest, IterEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + 
ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekForPrev("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + ASSERT_OK(iter->status()); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_P(DBIteratorTest, IterSingle) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_P(DBIteratorTest, IterMulti) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev("d"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + 
iter->Next();
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    // Make sure iter stays at snapshot
+    ASSERT_OK(Put(1, "a", "va2"));
+    ASSERT_OK(Put(1, "a2", "va3"));
+    ASSERT_OK(Put(1, "b", "vb2"));
+    ASSERT_OK(Put(1, "c", "vc2"));
+    ASSERT_OK(Delete(1, "b"));
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // insert three versions of the same user key and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put(1, "a", "zero"));
+  ASSERT_OK(Put(1, "a", "one"));
+  ASSERT_OK(Put(1, "a", "two"));
+  ASSERT_OK(Put(1, "b", "bone"));
+  Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a fourth version of the same user key and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put(1, "a", "three"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a fifth version of the same user key and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put(1, "a", "four"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have five versions of "a" and one version of "b".
+  // The reseek statistic is already at 1.
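+  // Capture the reseek count accumulated so far so that the assertions
+  // below can check deltas rather than absolute values.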
+  int num_reseeks = static_cast<int>(
+      TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  // Insert another version of b and assert that reseek is not invoked
+  ASSERT_OK(Put(1, "b", "btwo"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 5 versions of a.
+  ASSERT_OK(Put(1, "b", "bthree"));
+  ASSERT_OK(Put(1, "b", "bfour"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
+TEST_F(DBIteratorTest, ReseekUponDirectionChange) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(
+      new StringAppendTESTOperator(/*delim_char=*/' '));
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Put("bar", "value"));
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToLast();
+    it->Prev();
+    it->Next();
+  }
+  ASSERT_EQ(1,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  const std::string merge_key("good");
+  ASSERT_OK(Put(merge_key, "orig"));
+  ASSERT_OK(Merge(merge_key, "suffix"));
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek(merge_key);
+    ASSERT_TRUE(it->Valid());
+    const uint64_t prev_reseek_count =
+        options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+    it->Prev();
+    ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount(
+                                         NUMBER_OF_RESEEKS_IN_ITERATION));
+  }
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+    ASSERT_OK(Put(1, "c", "vc"));
+    ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+    ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "ka", "va"));
+    ASSERT_OK(Put(1, "kb", "vb"));
+    ASSERT_OK(Put(1, "kc", "vc"));
+    ASSERT_OK(Delete(1, "kb"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+    iter->Seek("kc");
+    ASSERT_EQ(IterStatus(iter), "kc->vc");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ &&
+          kHashSkipList != option_config_) {  // doesn't support SeekToLast
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "ka->va");
+      }
+    }
+    delete iter;
+  } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(Put(1, "key1", "v1"));
+      ASSERT_OK(Put(1, "key2", "v2"));
+      ASSERT_OK(Put(1, "key3", "v3"));
+      ASSERT_OK(Put(1, "key4", "v4"));
+      ASSERT_OK(Put(1, "key5", "v5"));
+    }
+
+    VerifyIterLast("key5->v5", 1);
+
+    ASSERT_OK(Delete(1, "key5"));
+    VerifyIterLast("key4->v4", 1);
+
+    ASSERT_OK(Delete(1, "key4"));
+    VerifyIterLast("key3->v3", 1);
+
+    ASSERT_OK(Delete(1, "key3"));
+    VerifyIterLast("key2->v2", 1);
+
+    ASSERT_OK(Delete(1, "key2"));
+    VerifyIterLast("key1->v1", 1);
+
+    ASSERT_OK(Delete(1, "key1"));
+    VerifyIterLast("(invalid)", 1);
+  } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(1, "key1", "val1"));
+    ASSERT_OK(Put(1, "key2", "val2"));
+    ASSERT_OK(Put(1, "key3", "val3"));
+    ASSERT_OK(Put(1, "key4", "val4"));
+    ASSERT_OK(Put(1, "key5", "val5"));
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions options;
+    options.snapshot = snapshot;
+    Iterator* iter = NewIterator(options, handles_[1]);
+
+    ASSERT_OK(Put(1, "key0", "val0"));
+    // Put more values after the snapshot
+    ASSERT_OK(Put(1, "key100", "val100"));
+    ASSERT_OK(Put(1, "key101", "val101"));
+
+    iter->Seek("key5");
+    ASSERT_EQ(IterStatus(iter), "key5->val5");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key5->val5");
+      }
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    if (!CurrentOptions().merge_operator) {
+      // TODO(gzh): merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+        iter->SeekForPrev("key1");
+        ASSERT_EQ(IterStatus(iter), "key1->val1");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key2->val2");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key2->val2");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key1->val1");
+        iter->Prev();
+        ASSERT_TRUE(!iter->Valid());
+      }
+    }
+    db_->ReleaseSnapshot(snapshot);
+    delete iter;
+  } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "hello"));
+
+    // Get iterator that will yield the current contents of the DB.
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    // Write to force compactions
+    ASSERT_OK(Put(1, "foo", "newvalue1"));
+    for (int i = 0; i < 100; i++) {
+      // ~100KB values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    ASSERT_OK(Put(1, "foo", "newvalue2"));
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter"));
+  ASSERT_OK(Put(1, "hello", "value2"));
+
+  ColumnFamilyHandle* cf = handles_[1];
+  ReadOptions ro;
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+  // delete CF handle
+  EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is deleted
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "hello->value2");
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter"));
+
+  ReadOptions ro;
+  ColumnFamilyHandle* cf = handles_[1];
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+  // drop and delete CF
+  EXPECT_OK(db_->DropColumnFamily(cf));
+  EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is dropped
+  delete iter;
+}
+
+// SetOptions not defined in ROCKSDB LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing basic case with no iterate_upper_bound and no prefix_extractor
+  {
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->SeekForPrev("g1");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+  }
+
+  // testing iterate_upper_bound and forward iterator
+  // to make sure it stops at bound
+  {
+    ReadOptions ro;
+    // iterate_upper_bound points beyond the last expected entry
+    Slice prefix("foo2");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+    iter->Next();
+    // should stop here...
+    ASSERT_TRUE(!iter->Valid());
+  }
+  // Testing SeekToLast with iterate_upper_bound set
+  {
+    ReadOptions ro;
+
+    Slice prefix("foo");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+  }
+
+  // prefix is the first letter of the key
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not in the same prefix
+  // This should be an error
+  {
+    ReadOptions ro;
+    Slice upper_bound("g");
+    ro.iterate_upper_bound = &upper_bound;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo1", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // testing that iterate_upper_bound prevents iterating over deleted items
+  // once the bound has been reached
+  {
+    options.prefix_extractor = nullptr;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("a", "0"));
+    ASSERT_OK(Put("b", "0"));
+    ASSERT_OK(Put("b1", "0"));
+    ASSERT_OK(Put("c", "0"));
+    ASSERT_OK(Put("d", "0"));
+    ASSERT_OK(Put("e", "0"));
+    ASSERT_OK(Delete("c"));
+    ASSERT_OK(Delete("d"));
+
+    // base case with no bound
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    get_perf_context()->Reset();
+    iter->Next();
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        2);
+
+    // now testing with iterate_bound
+    Slice prefix("c");
+    ro.iterate_upper_bound = &prefix;
+
+    iter.reset(NewIterator(ro));
+
+    get_perf_context()->Reset();
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    iter->Next();
+    // the iteration should stop as soon as the bound key is reached
+    // even though the key is deleted
+    // hence internal_delete_skipped_count should be 0
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        0);
+  }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("z", "0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo3", "bar3"));
+  ASSERT_OK(Put("foo4", "bar4"));
+
+  {
+    std::string up_str = "foo5";
+    Slice up(up_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &up;
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    uint64_t prev_block_cache_hit =
+        TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    uint64_t prev_block_cache_miss =
+        TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+    ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
+    iter->Seek("foo4");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+    iter->Seek("foo2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+  }
+}
+#endif
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+  for (auto format_version : {2, 3, 4}) {
+    int upper_bound_hits = 0;
+    Options options = CurrentOptions();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableIterator:out_of_bound",
+        [&upper_bound_hits](void*) { upper_bound_hits++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    BlockBasedTableOptions table_options;
+    table_options.format_version = format_version;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo1", "bar1"));
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_OK(Put("foo4", "bar4"));
+    ASSERT_OK(Flush());
+
+    Slice ub("foo3");
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(upper_bound_hits, 1);
+  }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
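+// With kBinarySearchWithFirstKey the index stores each data block's first
+// key, so a seek can often be resolved from the index alone and the data
+// block read deferred until the value is actually needed.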
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+  for (int tailing = 0; tailing < 2; ++tailing) {
+    SCOPED_TRACE("tailing = " + std::to_string(tailing));
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    Statistics* stats = options.statistics.get();
+    BlockBasedTableOptions table_options;
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+    table_options.index_shortening =
+        BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    table_options.block_cache =
+        NewLRUCache(8000);  // fits all blocks and their cache metadata overhead
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Merge("a1", "x1"));
+    ASSERT_OK(Merge("b1", "y1"));
+    ASSERT_OK(Merge("c0", "z1"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a2", "x2"));
+    ASSERT_OK(Merge("b2", "y2"));
+    ASSERT_OK(Merge("c0", "z2"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a3", "x3"));
+    ASSERT_OK(Merge("b3", "y3"));
+    ASSERT_OK(Merge("c3", "z3"));
+    ASSERT_OK(Flush());
+
+    // Block cache is not important for this test.
+    // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+    // available way of counting block accesses.
+
+    ReadOptions ropt;
+    ropt.tailing = tailing;
+    std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+    ropt.read_tier = ReadTier::kBlockCacheTier;
+    std::unique_ptr<Iterator> nonblocking_iter(NewIterator(ropt));
+
+    iter->Seek("b10");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b2", iter->key().ToString());
+    EXPECT_EQ("y2", iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    // The cache-only iterator should succeed too, using the blocks pulled into
+    // the cache by the previous iterator.
+    nonblocking_iter->Seek("b10");
+    ASSERT_TRUE(nonblocking_iter->Valid());
+    EXPECT_EQ("b2", nonblocking_iter->key().ToString());
+    EXPECT_EQ("y2", nonblocking_iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // ... but it shouldn't be able to step forward since the next block is
+    // not in cache yet.
+    nonblocking_iter->Next();
+    ASSERT_FALSE(nonblocking_iter->Valid());
+    ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+    // ... nor should a seek to the next key succeed.
+    nonblocking_iter->Seek("b20");
+    ASSERT_FALSE(nonblocking_iter->Valid());
+    ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b3", iter->key().ToString());
+    EXPECT_EQ("y3", iter->value().ToString());
+    EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // After the blocking iterator loaded the next block, the nonblocking
+    // iterator's seek should succeed.
+    nonblocking_iter->Seek("b20");
+    ASSERT_TRUE(nonblocking_iter->Valid());
+    EXPECT_EQ("b3", nonblocking_iter->key().ToString());
+    EXPECT_EQ("y3", nonblocking_iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->Seek("c0");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c0", iter->key().ToString());
+    EXPECT_EQ("z1,z2", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c3", iter->key().ToString());
+    EXPECT_EQ("z3", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter.reset();
+
+    // Enable iterate_upper_bound and check that iterator is not trying to read
+    // blocks that are fully above upper bound.
+    std::string ub = "b3";
+    Slice ub_slice(ub);
+    ropt.iterate_upper_bound = &ub_slice;
+    iter.reset(NewIterator(ropt));
+
+    iter->Seek("b2");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b2", iter->key().ToString());
+    EXPECT_EQ("y2", iter->value().ToString());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor = nullptr;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  Statistics* stats = options.statistics.get();
+  BlockBasedTableOptions table_options;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  table_options.block_cache = NewLRUCache(1000);  // fits all blocks
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Merge("a", "x1"));
+  ASSERT_OK(Merge("c", "y1"));
+  ASSERT_OK(Merge("e", "z1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("c", "y2"));
+  ASSERT_OK(Merge("e", "z2"));
+  ASSERT_OK(Flush());
+
+  // Get() between blocks shouldn't read any blocks.
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  // Get() of an existing key shouldn't read any unnecessary blocks when
+  // there's only one key per block.
+
+  ASSERT_EQ("y1,y2", Get("c"));
+  EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  ASSERT_EQ("x1", Get("a"));
+  EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+            MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+// return the largest key that is smaller than the seek key.
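+// In other words, after Seek("2") lands on "2", Prev() is expected to land
+// on "1"; the checks below pin down that behavior in the presence of merge
+// operands.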
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.env = env_;
+  DestroyAndReopen(options);
+
+  // write three entries with different keys using Merge()
+  WriteOptions wopts;
+  ASSERT_OK(db_->Merge(wopts, "1", "data1"));
+  ASSERT_OK(db_->Merge(wopts, "2", "data2"));
+  ASSERT_OK(db_->Merge(wopts, "3", "data3"));
+
+  std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+  it->Seek("2");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+
+  it->Prev();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->SeekForPrev("1");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->Next();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+  enum TestConfig {
+    NORMAL,
+    CLOSE_AND_OPEN,
+    COMPACT_BEFORE_READ,
+    FLUSH_EVERY_1000,
+    MAX
+  };
+  DBIteratorTestForPinnedData() : DBIteratorTest() {}
+  void PinnedDataIteratorRandomized(TestConfig run_config) {
+    // Generate Random data
+    Random rnd(301);
+
+    int puts = 100000;
+    int key_pool = static_cast<int>(puts * 0.7);
+    int key_size = 100;
+    int val_size = 1000;
+    int seeks_percentage = 20;   // 20% of keys will be used to test seek()
+    int delete_percentage = 20;  // 20% of keys will be deleted
+    int merge_percentage = 20;   // 20% of keys will be added using Merge()
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions table_options;
+    table_options.use_delta_encoding = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.merge_operator = MergeOperators::CreatePutOperator();
+    DestroyAndReopen(options);
+
+    std::vector<std::string> generated_keys(key_pool);
+    for (int i = 0; i < key_pool; i++) {
+      generated_keys[i] = rnd.RandomString(key_size);
+    }
+
+    std::map<std::string, std::string> true_data;
+    std::vector<std::string> random_keys;
+    std::vector<std::string> deleted_keys;
+    for (int i = 0; i < puts; i++) {
+      auto& k = generated_keys[rnd.Next() % key_pool];
+      auto v = rnd.RandomString(val_size);
+
+      // Insert data to true_data map and to DB
+      true_data[k] = v;
+      if (rnd.PercentTrue(merge_percentage)) {
+        ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+      } else {
+        ASSERT_OK(Put(k, v));
+      }
+
+      // Pick random keys to be used to test Seek()
+      if (rnd.PercentTrue(seeks_percentage)) {
+        random_keys.push_back(k);
+      }
+
+      // Delete some random keys
+      if (rnd.PercentTrue(delete_percentage)) {
+        deleted_keys.push_back(k);
+        true_data.erase(k);
+        ASSERT_OK(Delete(k));
+      }
+
+      if (run_config == TestConfig::FLUSH_EVERY_1000) {
+        if (i && i % 1000 == 0) {
+          ASSERT_OK(Flush());
+        }
+      }
+    }
+
+    if (run_config == TestConfig::CLOSE_AND_OPEN) {
+      Close();
+      Reopen(options);
+    } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    ReadOptions ro;
+    ro.pin_data = true;
+    auto iter = NewIterator(ro);
+
+    {
+      // Test Seek to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->Seek(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back(true_data.lower_bound(k)->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test SeekForPrev to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->SeekForPrev(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back((--true_data.upper_bound(k))->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test iterating all data forward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all key slices are valid
+      auto data_iter = true_data.begin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    {
+      // Test iterating all data backward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all key slices are valid (backward)
+      auto data_iter = true_data.rbegin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    delete iter;
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+  PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCloseAndOpen) {
+  PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+       PinnedDataIteratorRandomizedCompactBeforeRead) {
+  PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+  PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance,
+                        DBIteratorTestForPinnedData,
+                        testing::Values(true, false));
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 1024 * 1024 * 10;  // 10 Mb
+  DestroyAndReopen(options);
+
+  std::map<std::string, std::string> true_data;
+
+  // Generate 4 sst files in L2
+  Random rnd(301);
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 3);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+  // Generate 4 sst files in L0
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 2);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  // Add some keys/values in memtables
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), true_data.size());
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+#endif
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  DestroyAndReopen(options);
+
+  std::string numbers[7];
+  for (int val = 0; val <= 6; val++) {
+    PutFixed64(numbers + val, val);
+  }
+
+  // +1 all keys in range [0 => 999]
+  for (int i = 0; i < 1000; i++) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+  }
+
+  // +2 all keys divisible by 2 in range [0 => 999]
+  for (int i = 0; i < 1000; i += 2) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+  }
+
+  // +3 all keys divisible by 5 in range [0 => 999]
+  for (int i = 0; i < 1000; i += 5) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), 1000);
+  for (size_t i = 0; i < results.size(); i++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+    int expected_val = 1;
+    if (i % 2 == 0) {
+      expected_val += 2;
+    }
+    if (i % 5 == 0) {
+      expected_val += 3;
+    }
+    ASSERT_EQ(kv.second, numbers[expected_val]);
+  }
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.write_buffer_size = 100000;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::map<std::string, std::string> true_data;
+  for (int i = 0; i < 1000; i++) {
+    std::string k = rnd.RandomString(10);
+    std::string v = rnd.RandomString(1000);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  // Delete 50% of the keys and update the other 50%
+  for (auto& kv : true_data) {
+    if (rnd.OneIn(2)) {
+      ASSERT_OK(Delete(kv.first));
+    } else {
+      std::string new_val = rnd.RandomString(1000);
+      ASSERT_OK(Put(kv.first, new_val));
+    }
+  }
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 1);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 1;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 1;
+  }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
+    ASSERT_EQ(iter->value().ToString(), "va3");
+
+    iter->SeekForPrev("c2");
+    ASSERT_EQ(iter->key().ToString(), "b3");
+    iter->SeekForPrev("d3");
+    ASSERT_EQ(iter->key().ToString(), "d2");
+    iter->SeekForPrev("b5");
+    ASSERT_EQ(iter->key().ToString(), "b4");
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.prefix_same_as_start = true;
+    Iterator* iter = NewIterator(ro);
+    iter->SeekForPrev("c2");
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+  Options options = CurrentOptions();
+  options.prefix_extractor =
+      std::make_shared<SliceTransformLimitedDomainGeneric>();
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
ASSERT_EQ(iter->value().ToString(), "va3"); + + iter->SeekForPrev("c2"); + ASSERT_EQ(iter->key().ToString(), "b3"); + iter->SeekForPrev("d3"); + ASSERT_EQ(iter->key().ToString(), "d2"); + iter->SeekForPrev("b5"); + ASSERT_EQ(iter->key().ToString(), "b4"); + delete iter; + } + + { + ReadOptions ro; + ro.prefix_same_as_start = true; + Iterator* iter = NewIterator(ro); + iter->SeekForPrev("c2"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + } +} + +TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; // every block will contain one entry + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.disable_auto_compactions = true; + options.max_sequential_skip_in_iterations = 8; + + DestroyAndReopen(options); + + // Putting such deletes will force DBIter::Prev() to fallback to a Seek + for (int file_num = 0; file_num < 10; file_num++) { + ASSERT_OK(Delete("key4")); + ASSERT_OK(Flush()); + } + + // First File containing 5 blocks of puts + ASSERT_OK(Put("key1", "val1.0")); + ASSERT_OK(Put("key2", "val2.0")); + ASSERT_OK(Put("key3", "val3.0")); + ASSERT_OK(Put("key4", "val4.0")); + ASSERT_OK(Put("key5", "val5.0")); + ASSERT_OK(Flush()); + + // Second file containing 9 blocks of merge operands + ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2")); + + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2")); + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3")); + + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4")); + ASSERT_OK(Flush()); + + { + ReadOptions ro; + ro.fill_cache = false; + Iterator* iter = NewIterator(ro); + + iter->SeekToLast(); + ASSERT_EQ(iter->key().ToString(), "key5"); + ASSERT_EQ(iter->value().ToString(), "val5.0"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key4"); + ASSERT_EQ(iter->value().ToString(), "val4.0"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key3"); + ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key2"); + ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key1"); + ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2"); + + delete iter; + } +} + +TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.disable_auto_compactions = true; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.max_sequential_skip_in_iterations = 8; + DestroyAndReopen(options); + + const int kNumKeys = 500; + // Small number of merge operands to make sure that DBIter::Prev() don't + // fall back to Seek() + const int kNumMergeOperands = 3; + // Use value size that will make sure that every block contain 1 key + const int kValSize = + static_cast(BlockBasedTableOptions().block_size) * 4; + // Percentage of keys that wont get merge operations + const int kNoMergeOpPercentage = 20; + 
+  // Percentage of keys that will be deleted
+  const int kDeletePercentage = 10;
+
+  // For half of the key range we will write multiple deletes first to
+  // force DBIter::Prev() to fall back to Seek()
+  for (int file_num = 0; file_num < 10; file_num++) {
+    for (int i = 0; i < kNumKeys; i += 2) {
+      ASSERT_OK(Delete(Key(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  Random rnd(301);
+  std::map<std::string, std::string> true_data;
+  std::string gen_key;
+  std::string gen_val;
+
+  for (int i = 0; i < kNumKeys; i++) {
+    gen_key = Key(i);
+    gen_val = rnd.RandomString(kValSize);
+
+    ASSERT_OK(Put(gen_key, gen_val));
+    true_data[gen_key] = gen_val;
+  }
+  ASSERT_OK(Flush());
+
+  // Separate values and merge operands into different files so that we
+  // make sure that we don't merge them while flushing but actually
+  // merge them in the read path
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+      // Don't give merge operations to some keys
+      continue;
+    }
+
+    for (int j = 0; j < kNumMergeOperands; j++) {
+      gen_key = Key(i);
+      gen_val = rnd.RandomString(kValSize);
+
+      ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+      true_data[gen_key] += "," + gen_val;
+    }
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kDeletePercentage)) {
+      gen_key = Key(i);
+
+      ASSERT_OK(Delete(gen_key));
+      true_data.erase(gen_key);
+    }
+  }
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    int entries_right = 0;
+    std::string seek_key;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      // Verify key/value of current position
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+      bool restore_position_with_seek = rnd.Uniform(2);
+      if (restore_position_with_seek) {
+        seek_key = iter->key().ToString();
+      }
+
+      // Do some Next() operations, then restore the iterator to its original
+      // position
+      int next_count = entries_right > 0 ?
+          rnd.Uniform(std::min(entries_right, 10)) : 0;
+      for (int i = 0; i < next_count; i++) {
+        iter->Next();
+        data_iter--;
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      }
+
+      if (restore_position_with_seek) {
+        // Restore original position using Seek()
+        iter->Seek(seek_key);
+        for (int i = 0; i < next_count; i++) {
+          data_iter++;
+        }
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      } else {
+        // Restore original position using Prev()
+        for (int i = 0; i < next_count; i++) {
+          iter->Prev();
+          data_iter++;
+
+          ASSERT_EQ(iter->key().ToString(), data_iter->first);
+          ASSERT_EQ(iter->value().ToString(), data_iter->second);
+        }
+      }
+
+      entries_right++;
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 1000; i++) {
+    // Key 10 bytes / Value 10 bytes
+    ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+  }
+
+  std::atomic<uint64_t> total_next(0);
+  std::atomic<uint64_t> total_next_found(0);
+  std::atomic<uint64_t> total_prev(0);
+  std::atomic<uint64_t> total_prev_found(0);
+  std::atomic<uint64_t> total_bytes(0);
+
+  std::vector<port::Thread> threads;
+  std::function<void()> reader_func_next = [&]() {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToFirst();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Next();
+      total_next++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_next_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  std::function<void()> reader_func_prev = [&]() {
+    SetPerfLevel(kEnableCount);
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToLast();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Prev();
+      total_prev++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_prev_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back(reader_func_next);
+  }
+  for (int i = 0; i < 15; i++) {
+    threads.emplace_back(reader_func_prev);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+            (uint64_t)total_next_found);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+            (uint64_t)total_prev_found);
+  ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+            (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+  Options options;
+  env_->count_random_reads_ = true;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 4 << 20;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  std::string value(1024, 'a');
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  ReadOptions read_options;
+  auto* iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read = env_->random_read_bytes_counter_;
+  delete iter;
+
+  int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  read_options.readahead_size = 1024 * 10;
+  iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+  delete iter;
+  int64_t num_file_closes_readahead =
+      TestGetTickerCount(options, NO_FILE_CLOSES);
+  ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+  ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+  ASSERT_GT(bytes_read_readahead, bytes_read);
+  ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+  // Verify correctness.
+  iter = NewIterator(read_options);
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(value, iter->value());
+    count++;
+  }
+  ASSERT_EQ(100, count);
+  for (int i = 0; i < 100; i++) {
+    iter->Seek(Key(i));
+    ASSERT_EQ(value, iter->value());
+  }
+  delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 3;
+  options.prefix_extractor = nullptr;
+  options.write_buffer_size = 1 << 27;  // big enough to avoid flush
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  // Insert.
+  ASSERT_OK(Put("b", "0"));
+
+  // Create iterator.
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+  // Insert a lot.
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+  }
+
+#ifndef ROCKSDB_LITE
+  // Check that memtable wasn't flushed.
+  std::string val;
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+  EXPECT_EQ("0", val);
+#endif
+
+  // Seek iterator to a smaller key.
+  get_perf_context()->Reset();
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ("b", iter->key().ToString());
+  EXPECT_EQ("0", iter->value().ToString());
+
+  // Check that the seek didn't do too much work.
+  // Checks are not tight, just make sure that everything is well below 100.
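+  // [Illustrative aside, not part of the original test] The counters checked
+  // below come from the thread-local perf context; the usual read pattern is:
+  //   SetPerfLevel(kEnableCount);
+  //   get_perf_context()->Reset();
+  //   /* ... perform the reads ... */
+  //   uint64_t skipped = get_perf_context()->internal_key_skipped_count;
+  // The bounds are deliberately loose sanity limits, not exact counts.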
+  EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+  EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+  EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+  // Check that iterator did something like what we expect.
+  EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+  EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+  EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+  EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+  EXPECT_EQ(1,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+  ASSERT_OK(Put("x", "y"));
+
+  std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+  ASSERT_OK(iter->status());
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  ASSERT_OK(iter->Refresh());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+  ASSERT_OK(Put("m", "n"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  ASSERT_OK(iter->Refresh());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+  ASSERT_OK(Put("x", "y"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ReadOptions options;
+  options.snapshot = snapshot;
+  Iterator* iter = NewIterator(options);
+  ASSERT_OK(iter->status());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  Status s = iter->Refresh();
+  ASSERT_TRUE(s.IsNotSupported());
+  db_->ReleaseSnapshot(snapshot);
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+        *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Iterator* iter = NewIterator(ReadOptions());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  DestroyAndReopen(options);
+
+  // write a bunch of kvs to the database.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("y", "1"));
+  ASSERT_OK(Put("y1", "1"));
+  ASSERT_OK(Put("y2", "1"));
+  ASSERT_OK(Put("y3", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Put("bar", "1"));
+  ASSERT_OK(Put("foo", "1"));
+
+  std::string upper_bound = "x";
+  Slice ub_slice(upper_bound);
+  ReadOptions ro;
+  ro.iterate_upper_bound = &ub_slice;
+  ro.max_skippable_internal_keys = 1000;
+
+  Iterator* iter = NewIterator(ro);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+  ASSERT_OK(Put("a", "1"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("b", "2"));
+  ASSERT_OK(Put("c", "3"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("d", "4"));
+  ASSERT_OK(Put("e", "5"));
+  ASSERT_OK(Put("f", "6"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+  // Ensure the table_filter callback is called once for each table.
+  {
+    std::set<uint64_t> unseen{1, 2, 3};
+    ReadOptions opts;
+    opts.table_filter = [&](const TableProperties& props) {
+      auto it = unseen.find(props.num_entries);
+      if (it == unseen.end()) {
+        ADD_FAILURE() << "saw table properties with an unexpected "
+                      << props.num_entries << " entries";
+      } else {
+        unseen.erase(it);
+      }
+      return true;
+    };
+    auto iter = NewIterator(opts);
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->1");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->2");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->3");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->4");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->5");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "f->6");
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(unseen.empty());
+    delete iter;
+  }
+
+  // Ensure returning false in the table_filter hides the keys from that table
+  // during iteration.
+  {
+    ReadOptions opts;
+    opts.table_filter = [](const TableProperties& props) {
+      return props.num_entries != 2;
+    };
+    auto iter = NewIterator(opts);
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->1");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->4");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->5");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "f->6");
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  DestroyAndReopen(options);
+
+  // write a bunch of kvs to the database.
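+  // [Illustrative aside, not part of the original test] The point of the
+  // setup below: "foo" is overwritten more times than
+  // max_sequential_skip_in_iterations (3), so stepping backwards past "foo"
+  // forces DBIter to fall back to an internal reseek, and that reseek must
+  // still respect ReadOptions::iterate_upper_bound ("x").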
+ ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("y", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Put("foo", "1")); + ASSERT_OK(Put("foo", "2")); + + ASSERT_OK(Put("foo", "3")); + ASSERT_OK(Put("foo", "4")); + ASSERT_OK(Put("foo", "5")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put("foo", "6")); + + std::string upper_bound = "x"; + Slice ub_slice(upper_bound); + ReadOptions ro; + ro.snapshot = snapshot; + ro.iterate_upper_bound = &ub_slice; + + Iterator* iter = NewIterator(ro); + iter->SeekForPrev("goo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + iter->Prev(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBIteratorTest, SkipStatistics) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + + int skip_count = 0; + + // write a bunch of kvs to the database. + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("b", "1")); + ASSERT_OK(Put("c", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("d", "1")); + ASSERT_OK(Put("e", "1")); + ASSERT_OK(Put("f", "1")); + ASSERT_OK(Put("a", "2")); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("d")); + ASSERT_OK(Delete("e")); + ASSERT_OK(Delete("f")); + + Iterator* iter = NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ReadOptions()); + count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // Same as above, but in reverse order + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + ASSERT_OK(Put("aa", "1")); + ASSERT_OK(Put("ab", "1")); + ASSERT_OK(Put("ac", "1")); + ASSERT_OK(Put("ad", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("ab")); + ASSERT_OK(Delete("ac")); + ASSERT_OK(Delete("ad")); + + ReadOptions ro; + Slice prefix("b"); + ro.iterate_upper_bound = &prefix; + + iter = NewIterator(ro); + count = 0; + for (iter->Seek("aa"); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + skip_count += 6; // 3 deletes + 3 original keys + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ro); + count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 2); + delete iter; + // 3 deletes + 3 original keys + lower sequence of "a" + skip_count += 7; + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); +} + +TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ReadOptions ropts; + ropts.max_skippable_internal_keys = 2; + + ASSERT_OK(Put("1", "val_1")); + // Add more tombstones than max_skippable_internal_keys so that Next() fails. 
+ ASSERT_OK(Delete("2")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Delete("4")); + ASSERT_OK(Delete("5")); + ASSERT_OK(Put("6", "val_6")); + + std::unique_ptr iter(NewIterator(ropts)); + iter->SeekToFirst(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "1"); + ASSERT_EQ(iter->value().ToString(), "val_1"); + + // This should fail as incomplete due to too many non-visible internal keys on + // the way to the next valid user key. + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // Get the internal key at which Next() failed. + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); + ASSERT_EQ("4", prop_value); + + // Create a new iterator to seek to the internal key. + std::unique_ptr iter2(NewIterator(ropts)); + iter2->Seek(prop_value); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + + ASSERT_EQ(iter2->key().ToString(), "6"); + ASSERT_EQ(iter2->value().ToString(), "val_6"); +} + +// Reproduces a former bug where iterator would skip some records when DBIter +// re-seeks subiterator with Incomplete status. +TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + // Make sure the sst file has more than one block. + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Two records in sst file, each in its own block. + ASSERT_OK(Put("b", "")); + ASSERT_OK(Put("d", "")); + ASSERT_OK(Flush()); + + // Create a nonblocking iterator before writing to memtable. + ReadOptions ropt; + ropt.read_tier = kBlockCacheTier; + std::unique_ptr iter(NewIterator(ropt)); + + // Overwrite a key in memtable many times to hit + // max_sequential_skip_in_iterations (which is 8 by default). + for (int i = 0; i < 20; ++i) { + ASSERT_OK(Put("c", "")); + } + + // Load the second block in sst file into the block cache. + { + std::unique_ptr iter2(NewIterator(ReadOptions())); + iter2->Seek("d"); + } + + // Finally seek the nonblocking iterator. + iter->Seek("a"); + // With the bug, the status used to be OK, and the iterator used to point to + // "d". 
+  EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+  ASSERT_OK(Put("a", ""));
+  ASSERT_OK(Put("b", ""));
+  ASSERT_OK(Flush());
+
+  ReadOptions ropt;
+  Slice ub = "b";
+  ropt.iterate_upper_bound = &ub;
+
+  std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+  it->SeekForPrev("a");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  ASSERT_EQ("a", it->key().ToString());
+  it->Next();
+  ASSERT_FALSE(it->Valid());
+  ASSERT_OK(it->status());
+  it->SeekForPrev("a");
+  ASSERT_OK(it->status());
+
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 800;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string random_str = rnd.RandomString(180);
+
+  ASSERT_OK(Put("1", random_str));
+  ASSERT_OK(Put("2", random_str));
+  ASSERT_OK(Put("3", random_str));
+  ASSERT_OK(Put("4", random_str));
+  // A new block
+  ASSERT_OK(Put("5", random_str));
+  ASSERT_OK(Put("6", random_str));
+  ASSERT_OK(Put("7", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("8", random_str));
+  ASSERT_OK(Put("9", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  int num_find_file_in_level = 0;
+  int num_idx_blk_seek = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelIterator::Seek:BeforeFindFile",
+      [&](void* /*arg*/) { num_find_file_in_level++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+    iter->Seek("1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("3");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("6");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("7");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(3, num_idx_blk_seek);
+
+    iter->Seek("8");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(2, num_find_file_in_level);
+    // Still re-seek because "8" is the boundary key, which has
+    // the same user key as the seek key.
+    ASSERT_EQ(4, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    // Seek backward never triggers the index block seek to be skipped
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(6, num_idx_blk_seek);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure
+// such usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::string value(50, 'v');
+  Reopen(options);
+  ASSERT_OK(Put("aaa", value));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("eee", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  std::string ub1 = "e";
+  std::string ub2 = "c";
+  Slice ub(ub1);
+  ReadOptions read_opts1;
+  read_opts1.iterate_upper_bound = &ub;
+  Iterator* iter = NewIterator(read_opts1);
+  // Seek and iterate across the block boundary.
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  ub = Slice(ub2);
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+
+  std::string lb1 = "a";
+  std::string lb2 = "c";
+  Slice lb(lb1);
+  ReadOptions read_opts2;
+  read_opts2.iterate_lower_bound = &lb;
+  iter = NewIterator(read_opts2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  lb = Slice(lb2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+  ASSERT_OK(Put("aaa", "v"));
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  // Move both files to bottom level.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Slice lower_bound("b");
+  ReadOptions read_opts;
+  read_opts.iterate_lower_bound = &lower_bound;
+  std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+TEST_P(DBIteratorTest, Blob) {
+  Options options = CurrentOptions();
+  options.enable_blob_files = true;
+  options.max_sequential_skip_in_iterations = 2;
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  // Note: we have 4 KVs (3 of which are hidden) for key "b" and
+  // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a
+  // reseek anytime we move from "b" to "c" or vice versa.
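+  // [Illustrative aside, not part of the original test] A "reseek" is DBIter
+  // giving up on skipping hidden versions one by one: once more than
+  // max_sequential_skip_in_iterations (2, above) obsolete versions of a key
+  // are encountered, it calls Seek() on its internal iterator instead, and
+  // the NUMBER_OF_RESEEKS_IN_ITERATION ticker below counts those fallbacks.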
+ ASSERT_OK(Put("a", "va")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Flush()); + + std::unique_ptr iter_guard(NewIterator(ReadOptions())); + Iterator* const iter = iter_guard.get(); + + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->SeekForPrev("d"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->Seek("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Seek("z"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->SeekForPrev(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + 
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+  // Switch from forward to reverse
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+                        testing::Values(true, false));
+
+// Tests how DBIter works with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+  class TestReadCallback : public ReadCallback {
+   public:
+    explicit TestReadCallback(SequenceNumber _max_visible_seq)
+        : ReadCallback(_max_visible_seq) {}
+
+    bool IsVisibleFullCheck(SequenceNumber seq) override {
+      return seq <= max_visible_seq_;
+    }
+  };
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("foo", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Put("z", "vz"));
+  SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+  TestReadCallback callback1(seq1);
+  ASSERT_OK(Put("foo", "v4"));
+  ASSERT_OK(Put("foo", "v5"));
+  ASSERT_OK(Put("bar", "v7"));
+
+  SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  // The iterator is supposed to see data before seq1.
+  Iterator* iter =
+      dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+  // Seek
+  // The latest value of "foo" before seq1 is "v3"
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Next
+  // Seek to "a"
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("va", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Prev
+  // Seek to "z"
+  iter->Seek("z");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("vz", iter->value());
+  // The previous key is "foo", which is visible to the iterator.
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key "a".
+  iter->Prev();  // skipping "bar"
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("a", iter->key());
+  ASSERT_EQ("va", iter->value());
+
+  // SeekForPrev
+  // The previous key is "foo", which is visible to the iterator.
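+  // [Illustrative aside, not part of the original test] SeekForPrev(k)
+  // positions the iterator at the largest key <= k, so SeekForPrev("y")
+  // below lands on "foo" ("z" is excluded because "z" > "y").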
+ iter->SeekForPrev("y"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + // "bar" is not visible to the iterator. It will move on to the next key "a". + iter->SeekForPrev("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("a", iter->key()); + ASSERT_EQ("va", iter->value()); + + delete iter; + + // Prev beyond max_sequential_skip_in_iterations + uint64_t num_versions = + CurrentOptions().max_sequential_skip_in_iterations + 10; + for (uint64_t i = 0; i < num_versions; i++) { + ASSERT_OK(Put("bar", std::to_string(i))); + } + SequenceNumber seq3 = db_->GetLatestSequenceNumber(); + TestReadCallback callback2(seq3); + ASSERT_OK(Put("bar", "v8")); + SequenceNumber seq4 = db_->GetLatestSequenceNumber(); + + // The iterator is suppose to see data before seq3. + iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2); + // Seek to "z", which is visible. + iter->Seek("z"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("vz", iter->value()); + // Previous key is "foo" and the last value "v5" is visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v5", iter->value()); + // Since the number of values of "bar" is more than + // max_sequential_skip_in_iterations, Prev() will ultimately fallback to + // seek in forward direction. Here we test the fallback seek is correct. + // The last visible value should be (num_versions - 1), as "v8" is not + // visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bar", iter->key()); + ASSERT_EQ(std::to_string(num_versions - 1), iter->value()); + + delete iter; +} + +TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = false; + options.env = env_; + DestroyAndReopen(options); + constexpr int kNumKeys = 10; + + // Write kNumKeys to WAL. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + int count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ++count; + } + ASSERT_EQ(kNumKeys, count); + } + + // Reopen and rebuild the memtable from WAL. + options.create_if_missing = false; + options.avoid_flush_during_recovery = true; + options.inplace_update_support = true; + options.allow_concurrent_memtable_write = false; + Reopen(options); + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + // Backward iteration not supported due to inplace_update_support = true. + ASSERT_TRUE(iter->status().IsNotSupported()); + ASSERT_FALSE(iter->Valid()); + } +} + +TEST_F(DBIteratorTest, IteratorRefreshReturnSV) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + std::unique_ptr iter{db_->NewIterator(ReadOptions())}; + SyncPoint::GetInstance()->SetCallBack( + "ArenaWrappedDBIter::Refresh:SV", [&](void*) { + ASSERT_OK(db_->Put(WriteOptions(), "dummy", "new SV")); + // This makes the local SV obselete. 
+        ASSERT_OK(Flush());
+        SyncPoint::GetInstance()->DisableProcessing();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(iter->Refresh());
+  iter.reset();
+  // iter used to not clean up its SV, so the Close() below would hit an
+  // assertion error.
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_kv_checksum_test.cc b/src/rocksdb/db/db_kv_checksum_test.cc
new file mode 100644
index 000000000..614399243
--- /dev/null
+++ b/src/rocksdb/db/db_kv_checksum_test.cc
@@ -0,0 +1,885 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class WriteBatchOpType {
+  kPut = 0,
+  kDelete,
+  kSingleDelete,
+  kMerge,
+  kPutEntity,
+  kDeleteRange,
+  kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
+  using T = std::underlying_type<WriteBatchOpType>::type;
+  return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
+}
+
+enum class WriteMode {
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key = 0`
+  // and `WriteOptions::protection_bytes_per_key = 0`
+  kWriteUnprotectedBatch = 0,
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
+  kWriteProtectedBatch,
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
+  // Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
+  kWriteOptionProtectedBatch,
+  // TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
+  kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
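+// [Illustrative aside, not part of the original change] ::testing::Range()
+// generates parameter values by repeatedly adding a step (default 1) to the
+// start value, so an enum class can participate only if an operator+ like the
+// ones defined here is in scope. For example:
+//   ::testing::Range(WriteBatchOpType::kPut, WriteBatchOpType::kNum)
+// yields kPut, kDelete, ..., up to but excluding kNum.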
+WriteMode operator+(WriteMode lhs, const int rhs) {
+  using T = std::underlying_type<WriteMode>::type;
+  return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
+}
+
+std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
+                                            size_t protection_bytes_per_key,
+                                            WriteBatchOpType op_type) {
+  Status s;
+  WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
+                protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  switch (op_type) {
+    case WriteBatchOpType::kPut:
+      s = wb.Put(cf_handle, "key", "val");
+      break;
+    case WriteBatchOpType::kDelete:
+      s = wb.Delete(cf_handle, "key");
+      break;
+    case WriteBatchOpType::kSingleDelete:
+      s = wb.SingleDelete(cf_handle, "key");
+      break;
+    case WriteBatchOpType::kDeleteRange:
+      s = wb.DeleteRange(cf_handle, "begin", "end");
+      break;
+    case WriteBatchOpType::kMerge:
+      s = wb.Merge(cf_handle, "key", "val");
+      break;
+    case WriteBatchOpType::kPutEntity:
+      s = wb.PutEntity(cf_handle, "key",
+                       {{"attr_name1", "foo"}, {"attr_name2", "bar"}});
+      break;
+    case WriteBatchOpType::kNum:
+      assert(false);
+  }
+  return {std::move(wb), std::move(s)};
+}
+
+class DbKvChecksumTestBase : public DBTestBase {
+ public:
+  DbKvChecksumTestBase(const std::string& path, bool env_do_fsync)
+      : DBTestBase(path, env_do_fsync) {}
+
+  ColumnFamilyHandle* GetCFHandleToUse(ColumnFamilyHandle* column_family,
+                                       WriteBatchOpType op_type) const {
+    // Note: PutEntity cannot be called without a column family
+    if (op_type == WriteBatchOpType::kPutEntity && !column_family) {
+      return db_->DefaultColumnFamily();
+    }
+
+    return column_family;
+  }
+};
+
+class DbKvChecksumTest
+    : public DbKvChecksumTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>> {
+ public:
+  DbKvChecksumTest()
+      : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+    op_type_ = std::get<0>(GetParam());
+    corrupt_byte_addend_ = std::get<1>(GetParam());
+    write_mode_ = std::get<2>(GetParam());
+    memtable_protection_bytes_per_key_ = std::get<3>(GetParam());
+  }
+
+  Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
+    switch (write_mode_) {
+      case WriteMode::kWriteUnprotectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          0 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        // The default write option has protection_bytes_per_key = 0
+        return db_->Write(WriteOptions(), &batch_and_status.first);
+      }
+      case WriteMode::kWriteProtectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          8 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        return db_->Write(WriteOptions(), &batch_and_status.first);
+      }
+      case WriteMode::kWriteOptionProtectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          0 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        WriteOptions write_opts;
+        write_opts.protection_bytes_per_key = 8;
+        return db_->Write(write_opts, &batch_and_status.first);
+      }
+      case WriteMode::kNum:
+        assert(false);
+    }
+    return Status::NotSupported("WriteMode " +
+                                std::to_string(static_cast<int>(write_mode_)));
+  }
+
+  void CorruptNextByteCallBack(void* arg) {
+    Slice encoded = *static_cast<Slice*>(arg);
+    if (entry_len_ == std::numeric_limits<size_t>::max()) {
+      // We learn the entry size on the first attempt
+      entry_len_ = encoded.size();
+    }
+    char* buf = const_cast<char*>(encoded.data());
+    buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+    ++corrupt_byte_offset_;
+  }
+
+  bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; }
+
+ protected:
+  WriteBatchOpType op_type_;
+  char corrupt_byte_addend_;
+  WriteMode write_mode_;
+  uint32_t memtable_protection_bytes_per_key_;
+  size_t corrupt_byte_offset_ = 0;
+  size_t entry_len_ = std::numeric_limits<size_t>::max();
+};
+
+std::string GetOpTypeString(const WriteBatchOpType& op_type) {
+  switch (op_type) {
+    case WriteBatchOpType::kPut:
+      return "Put";
+    case WriteBatchOpType::kDelete:
+      return "Delete";
+    case WriteBatchOpType::kSingleDelete:
+      return "SingleDelete";
+    case WriteBatchOpType::kDeleteRange:
+      return "DeleteRange";
+    case WriteBatchOpType::kMerge:
+      return "Merge";
+    case WriteBatchOpType::kPutEntity:
+      return "PutEntity";
+    case WriteBatchOpType::kNum:
+      assert(false);
+  }
+  assert(false);
+  return "";
+}
+
+std::string GetWriteModeString(const WriteMode& mode) {
+  switch (mode) {
+    case WriteMode::kWriteUnprotectedBatch:
+      return "WriteUnprotectedBatch";
+    case WriteMode::kWriteProtectedBatch:
+      return "WriteProtectedBatch";
+    case WriteMode::kWriteOptionProtectedBatch:
+      return "WriteOptionProtectedBatch";
+    case WriteMode::kNum:
+      assert(false);
+  }
+  return "";
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DbKvChecksumTest, DbKvChecksumTest,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Values(2, 103, 251),
+                       ::testing::Range(WriteMode::kWriteProtectedBatch,
+                                        WriteMode::kNum),
+                       ::testing::Values(0)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<1>(args.param)))
+          << GetWriteModeString(std::get<2>(args.param))
+          << static_cast<uint32_t>(std::get<3>(args.param));
+      return oss.str();
+    });
+
+// TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
+// corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
+
+TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted in its
+  // memtable entry by adding `corrupt_byte_addend_` to its original value. The
+  // test repeats until an attempt has been made on each byte in the encoded
+  // memtable entry. All attempts are expected to fail with
+  // `Status::Corruption`.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+
+  while (MoreBytesToCorrupt()) {
+    // A failed memtable insert always leads to read-only mode, so we have to
+    // reopen for every attempt.
+    Options options = CurrentOptions();
+    if (op_type_ == WriteBatchOpType::kMerge) {
+      options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    }
+    Reopen(options);
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test would otherwise run
+    // numeric_limits<size_t>::max() times until it reports an error (or
+    // exhausts disk space). This assert reports the error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
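+// Note: with `protection_bytes_per_key = 8`, each entry in the batch carries
+// a 64-bit protection value derived from its key, value, op type, and column
+// family. Flipping any single byte of the encoded memtable entry changes the
+// value recomputed during verification, which is why every per-byte
+// corruption above is expected to be reported as `Status::Corruption`.
+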
+TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_` to a non-default column family. Each attempt has
+  // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_`
+  // to its original value. The test repeats until an attempt has been made on
+  // each byte in the encoded memtable entry. All attempts are expected to fail
+  // with `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+
+  while (MoreBytesToCorrupt()) {
+    // A failed memtable insert always leads to read-only mode, so we have to
+    // reopen for every attempt.
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test would otherwise run
+    // numeric_limits<size_t>::max() times until it reports an error (or
+    // exhausts disk space). This assert reports the error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+TEST_P(DbKvChecksumTest, NoCorruptionCase) {
+  // If this test fails, we may have found a piece of malfunctioning hardware
+  auto batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type_),
+                    8 /* protection_bytes_per_key */, op_type_);
+  ASSERT_OK(batch_and_status.second);
+  ASSERT_OK(batch_and_status.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+  // `corrupt_byte_addend_` to its original value. The test repeats until an
+  // attempt has been made on each byte in the encoded write batch. All
+  // attempts are expected to fail with `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::WriteToWAL:log_entry",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  // The first 8 bytes are the sequence number, which is not protected in the
+  // write batch
+  corrupt_byte_offset_ = 8;
+
+  while (MoreBytesToCorrupt()) {
+    // A corrupted write batch leads to read-only mode, so we have to
+    // reopen for every attempt.
+    Reopen(options);
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    // Confirm that nothing was written to the WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test would otherwise run
+    // numeric_limits<size_t>::max() times until it reports an error (or
+    // exhausts disk space). This assert reports the error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
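+// Note: a `WriteBatch`'s wire format begins with a 12-byte header (an 8-byte
+// sequence number followed by a 4-byte entry count) ahead of the per-entry
+// records. The sequence number is only assigned when the batch is written to
+// the WAL, so it is not covered by the per-key protection, and the WAL tests
+// here start corrupting at offset 8.
+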
+TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+  // `corrupt_byte_addend_` to its original value. The test repeats until an
+  // attempt has been made on each byte in the encoded write batch. All
+  // attempts are expected to fail with `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::WriteToWAL:log_entry",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  // The first 8 bytes are the sequence number, which is not protected in the
+  // write batch
+  corrupt_byte_offset_ = 8;
+
+  while (MoreBytesToCorrupt()) {
+    // A corrupted write batch leads to read-only mode, so we have to
+    // reopen for every attempt.
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    // Confirm that nothing was written to the WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test would otherwise run
+    // numeric_limits<size_t>::max() times until it reports an error (or
+    // exhausts disk space). This assert reports the error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+class DbKvChecksumTestMergedBatch
+    : public DbKvChecksumTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
+ public:
+  DbKvChecksumTestMergedBatch()
+      : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+    op_type1_ = std::get<0>(GetParam());
+    op_type2_ = std::get<1>(GetParam());
+    corrupt_byte_addend_ = std::get<2>(GetParam());
+  }
+
+ protected:
+  WriteBatchOpType op_type1_;
+  WriteBatchOpType op_type2_;
+  char corrupt_byte_addend_;
+};
+
+void CorruptWriteBatch(Slice* content, size_t offset,
+                       char corrupt_byte_addend) {
+  ASSERT_TRUE(offset < content->size());
+  char* buf = const_cast<char*>(content->data());
+  buf[offset] += corrupt_byte_addend;
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
+  // Verify the write batch checksum after a write batch append
+  auto batch1 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                              8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(batch1.second);
+  auto batch2 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                              8 /* protection_bytes_per_key */, op_type2_);
+  ASSERT_OK(batch2.second);
+  ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
+  ASSERT_OK(batch1.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
+  // This test has two writers repeatedly attempt to write `WriteBatch`es
+  // containing a single entry of type op_type1_ and op_type2_ respectively.
+  // The leader of the write group writes the batch containing the entry of
+  // type op_type1_. One byte of the pre-merged write batches is corrupted by
+  // adding `corrupt_byte_addend_` to the batch's original value during each
+  // attempt. The test repeats until an attempt has been made on each byte in
+  // both pre-merged write batches. All attempts are expected to fail with
+  // `Status::Corruption`.
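+  // The corruption is injected while the write group is being formed: the
+  // leader is held at WriteThread::JoinBatchGroup:Wait2 until the follower
+  // has joined (and possibly been corrupted) at
+  // WriteThread::JoinBatchGroup:Wait, so the merged group always contains
+  // both batches before the WAL write happens.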
+  Options options = CurrentOptions();
+  if (op_type1_ == WriteBatchOpType::kMerge ||
+      op_type2_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  auto leader_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                    8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(leader_batch_and_status.second);
+  auto follower_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                    8 /* protection_bytes_per_key */, op_type2_);
+  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+  size_t total_bytes =
+      leader_batch_size + follower_batch_and_status.first.GetDataSize();
+  // The first 8 bytes are the sequence number, which is not protected in the
+  // write batch
+  size_t corrupt_byte_offset = 8;
+
+  std::atomic<bool> follower_joined{false};
+  std::atomic<int> leader_count{0};
+  port::Thread follower_thread;
+  // This callback should only be called by the leader thread
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+        auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+        ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+        // This callback should only be called by the follower thread
+        SyncPoint::GetInstance()->SetCallBack(
+            "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+              auto* follower =
+                  reinterpret_cast<WriteThread::Writer*>(arg_follower);
+              // The leader thread will wait on this bool and hence wait until
+              // this writer joins the write group
+              ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+              if (corrupt_byte_offset >= leader_batch_size) {
+                Slice batch_content = follower->batch->Data();
+                CorruptWriteBatch(&batch_content,
+                                  corrupt_byte_offset - leader_batch_size,
+                                  corrupt_byte_addend_);
+              }
+              // The leader busy waits on this flag
+              follower_joined = true;
+              // So the follower does not enter the outer callback at
+              // WriteThread::JoinBatchGroup:Wait2
+              SyncPoint::GetInstance()->DisableProcessing();
+            });
+
+        // Start the other writer thread which will join the write group as
+        // the follower
+        follower_thread = port::Thread([&]() {
+          follower_batch_and_status =
+              GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                            8 /* protection_bytes_per_key */, op_type2_);
+          ASSERT_OK(follower_batch_and_status.second);
+          ASSERT_TRUE(
+              db_->Write(WriteOptions(), &follower_batch_and_status.first)
+                  .IsCorruption());
+        });
+
+        ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+        if (corrupt_byte_offset < leader_batch_size) {
+          Slice batch_content = leader->batch->Data();
+          CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+                            corrupt_byte_addend_);
+        }
+        leader_count++;
+        while (!follower_joined) {
+          // busy waiting
+        }
+      });
+  while (corrupt_byte_offset < total_bytes) {
+    // Reopen the DB since the failed WAL write left it in read-only mode
+    Reopen(options);
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    leader_batch_and_status =
+        GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                      8 /* protection_bytes_per_key */, op_type1_);
+    ASSERT_OK(leader_batch_and_status.second);
+    ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+                    .IsCorruption());
+    follower_thread.join();
+    // Prevent the leader thread from entering this callback
+    SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+    ASSERT_EQ(1, leader_count);
+    // Nothing should have been written to the WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
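+    // The rejected write is also recorded as a background error, which is
+    // what keeps the DB in read-only mode until the next reopen.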
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+    corrupt_byte_offset++;
+    if (corrupt_byte_offset == leader_batch_size) {
+      // Skip over the sequence number part of the follower's write batch
+      corrupt_byte_offset += 8;
+    }
+    follower_joined = false;
+    leader_count = 0;
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
+  // This test has two writers repeatedly attempt to write `WriteBatch`es
+  // containing a single entry of type op_type1_ and op_type2_ respectively.
+  // The leader of the write group writes the batch containing the entry of
+  // type op_type1_. One byte of the pre-merged write batches is corrupted by
+  // adding `corrupt_byte_addend_` to the batch's original value during each
+  // attempt. The test repeats until an attempt has been made on each byte in
+  // both pre-merged write batches. All attempts are expected to fail with
+  // `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type1_ == WriteBatchOpType::kMerge ||
+      op_type2_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"ramen"}, options);
+
+  auto leader_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+                    8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(leader_batch_and_status.second);
+  auto follower_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+                    8 /* protection_bytes_per_key */, op_type2_);
+  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+  size_t total_bytes =
+      leader_batch_size + follower_batch_and_status.first.GetDataSize();
+  // The first 8 bytes are the sequence number, which is not protected in the
+  // write batch
+  size_t corrupt_byte_offset = 8;
+
+  std::atomic<bool> follower_joined{false};
+  std::atomic<int> leader_count{0};
+  port::Thread follower_thread;
+  // This callback should only be called by the leader thread
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+        auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+        ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+        // This callback should only be called by the follower thread
+        SyncPoint::GetInstance()->SetCallBack(
+            "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+              auto* follower =
+                  reinterpret_cast<WriteThread::Writer*>(arg_follower);
+              // The leader thread will wait on this bool and hence wait until
+              // this writer joins the write group
+              ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+              if (corrupt_byte_offset >= leader_batch_size) {
+                Slice batch_content =
+                    WriteBatchInternal::Contents(follower->batch);
+                CorruptWriteBatch(&batch_content,
+                                  corrupt_byte_offset - leader_batch_size,
+                                  corrupt_byte_addend_);
+              }
+              follower_joined = true;
+              // So the follower does not enter the outer callback at
+              // WriteThread::JoinBatchGroup:Wait2
+              SyncPoint::GetInstance()->DisableProcessing();
+            });
+
+        // Start the other writer thread which will join the write group as
+        // the follower
+        follower_thread = port::Thread([&]() {
+          follower_batch_and_status =
+              GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+                            8 /* protection_bytes_per_key */, op_type2_);
+          ASSERT_OK(follower_batch_and_status.second);
+          ASSERT_TRUE(
+              db_->Write(WriteOptions(), &follower_batch_and_status.first)
+                  .IsCorruption());
+        });
+
+        ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+        if (corrupt_byte_offset < leader_batch_size) {
+          Slice batch_content = WriteBatchInternal::Contents(leader->batch);
+          CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+                            corrupt_byte_addend_);
+        }
+        leader_count++;
+        while (!follower_joined) {
+          // busy waiting
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  while (corrupt_byte_offset < total_bytes) {
+    // Reopen the DB since the failed WAL write left it in read-only mode
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    leader_batch_and_status =
+        GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+                      8 /* protection_bytes_per_key */, op_type1_);
+    ASSERT_OK(leader_batch_and_status.second);
+    ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+                    .IsCorruption());
+    follower_thread.join();
+    // Prevent the leader thread from entering this callback
+    SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+
+    ASSERT_EQ(1, leader_count);
+    // Nothing should have been written to the WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+    corrupt_byte_offset++;
+    if (corrupt_byte_offset == leader_batch_size) {
+      // Skip over the sequence number part of the follower's write batch
+      corrupt_byte_offset += 8;
+    }
+    follower_joined = false;
+    leader_count = 0;
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Values(2, 103, 251)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param))
+          << GetOpTypeString(std::get<1>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<2>(args.param)));
+      return oss.str();
+    });
+
+// TODO: add a test for transactions
+// TODO: add a test for a corrupted write batch with the WAL disabled
+
+class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
+ public:
+  DbKVChecksumWALToWriteBatchTest()
+      : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  std::string content = "";
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
+      [&](void* batch_ptr) {
+        WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
+        content.assign(batch->Data().data(), batch->GetDataSize());
+        Slice batch_content = batch->Data();
+        // Corrupt the first bit
+        CorruptWriteBatch(&batch_content, 0, 1);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+      [&](void* checksum_ptr) {
+        // Verify that the checksum is computed over the batch content
+        uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
+        ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_TRUE(TryReopen(options).IsCorruption());
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// TODO (cbi): add DeleteRange coverage once it is implemented
+class DbMemtableKVChecksumTest : public DbKvChecksumTest {
+ public:
+  DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
+
+ protected:
+  // Indices in the memtable entry that we will not corrupt.
+  // For the memtable entry format, see the comments in MemTable::Add().
+  // We do not corrupt the key length and value length fields in this test
+  // case since that causes a segfault and ASAN will complain.
+  // For this test case, the key and value are both of length 3, so the
+  // key length field is at index 0 and the value length field is at index 12.
+  const std::set<size_t> index_not_to_corrupt{0, 12};
+
+  void SkipNotToCorruptEntry() {
+    if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
+        index_not_to_corrupt.end()) {
+      corrupt_byte_offset_++;
+    }
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DbMemtableKVChecksumTest, DbMemtableKVChecksumTest,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kDeleteRange),
+                       ::testing::Values(2, 103, 251),
+                       ::testing::Range(static_cast<WriteMode>(0),
+                                        WriteMode::kWriteOptionProtectedBatch),
+                       // skip 1-byte checksums as they make the test flaky
+                       ::testing::Values(2, 4, 8)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<1>(args.param)))
+          << GetWriteModeString(std::get<2>(args.param))
+          << static_cast<uint32_t>(std::get<3>(args.param));
+      return oss.str();
+    });
+
+TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+        Slice encoded = *static_cast<Slice*>(arg);
+        entry_len_ = encoded.size();
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+        char* buf = *static_cast<char**>(entry);
+        buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+        ++corrupt_byte_offset_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    ASSERT_OK(ExecuteWrite(nullptr));
+    std::string val;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+       GetWithColumnFamilyCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
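+  // (Entry layout, per MemTable::Add(): varint32 internal key size | user key
+  // bytes | 8-byte packed sequence number and type | varint32 value size |
+  // value bytes | protection bytes. With 3-byte keys and values, the two
+  // length fields land at offsets 0 and 12, matching `index_not_to_corrupt`
+  // above.)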
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+        Slice encoded = *static_cast<Slice*>(arg);
+        entry_len_ = encoded.size();
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+        char* buf = *static_cast<char**>(entry);
+        buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+        ++corrupt_byte_offset_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(ExecuteWrite(handles_[1]));
+    std::string val;
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), handles_[1], "key", &val).IsCorruption());
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest, IteratorWithCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    ASSERT_OK(ExecuteWrite(nullptr));
+    Iterator* it = db_->NewIterator(ReadOptions());
+    it->SeekToFirst();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_TRUE(it->status().IsCorruption());
+    delete it;
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+       IteratorWithColumnFamilyCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(ExecuteWrite(handles_[1]));
+    Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+    it->SeekToFirst();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_TRUE(it->status().IsCorruption());
+    delete it;
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest, FlushWithCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  // Not corrupting each byte like the other tests since Flush() is relatively
+  // slow.
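+  // A single corrupted byte suffices here: flush reads back every memtable
+  // entry and verifies its per-key protection info, so it should surface the
+  // corruption and set the background error.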
+  Reopen(options);
+  ASSERT_OK(ExecuteWrite(nullptr));
+  ASSERT_TRUE(Flush().IsCorruption());
+  // The DB enters read-only mode when flush reads corrupted data
+  ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+  Destroy(options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..4e982858c
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release build, which is a pity as it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+  DBTestXactLogIterator()
+      : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {}
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    std::unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    EXPECT_OK(status);
+    EXPECT_TRUE(iter->Valid());
+    return iter;
+  }
+};
+
+namespace {
+SequenceNumber ReadRecords(std::unique_ptr<TransactionLogIterator>& iter,
+                           int& count, bool expect_ok = true) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  if (expect_ok) {
+    EXPECT_OK(iter->status());
+  } else {
+    EXPECT_NOK(iter->status());
+  }
+  return res.sequence;
+}
+
+void ExpectRecords(const int expected_no_records,
+                   std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // anonymous namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put(0, "key1", DummyString(1024)));
+    ASSERT_OK(Put(1, "key2", DummyString(1024)));
+    ASSERT_OK(Put(1, "key2", DummyString(1024)));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      ASSERT_OK(Put(0, "key4", DummyString(1024)));
+      ASSERT_OK(Put(1, "key5", DummyString(1024)));
+      ASSERT_OK(Put(0, "key6", DummyString(1024)));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync points are not included in the -DNDEBUG build
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+      {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+      {"WalManager::GetSortedWalsOfType:1", "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2",
+       "WalManager::GetSortedWalsOfType:2"}};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of
+    // a log file being moved to the archived dir in the middle of
+    // GetSortedWalFiles
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {sync_points[test][0], sync_points[test][1]},
+        {sync_points[test][2], sync_points[test][3]},
+    });
+
+    do {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(options);
+      ASSERT_OK(Put("key1", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key2", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key3", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key4", DummyString(1024)));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+      ASSERT_OK(dbfull()->FlushWAL(false));
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will
+      // wait at GetSortedWalFiles:1 to reproduce the race
+      // condition
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      ASSERT_OK(dbfull()->Flush(flush_options));
+
+      // "key5" would be written in a new memtable and log
+      ASSERT_OK(Put("key5", DummyString(1024)));
+      ASSERT_OK(dbfull()->FlushWAL(false));
+      {
+        // this iter would miss "key4" if not fixed
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key1", DummyString(1024)));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_OK(Put("key2", DummyString(1024)));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key1", DummyString(1024)));
+    ASSERT_OK(Put("key2", DummyString(1023)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    Reopen(options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+
+    for (int i = 0; i < 1024; i++) {
+      ASSERT_OK(Put("key" + std::to_string(i), DummyString(10)));
+    }
+
+    ASSERT_OK(Flush());
+    ASSERT_OK(db_->FlushWAL(false));
+
+    // Corrupt this log to create a gap
+    ASSERT_OK(db_->DisableFileDeletions());
+
+    VectorLogPtr wal_files;
+    ASSERT_OK(db_->GetSortedWalFiles(wal_files));
+    ASSERT_FALSE(wal_files.empty());
+
+    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+ "/" + wal_files.front()->PathName(); + ASSERT_OK(test::TruncateFile(env_, logfile_path, + wal_files.front()->SizeFileBytes() / 2)); + + ASSERT_OK(db_->EnableFileDeletions()); + + // Insert a new entry to a new log file + ASSERT_OK(Put("key1025", DummyString(10))); + ASSERT_OK(db_->FlushWAL(false)); + + // Try to read from the beginning. Should stop before the gap and read less + // than 1025 entries + auto iter = OpenTransactionLogIter(0); + int count = 0; + SequenceNumber last_sequence_read = ReadRecords(iter, count, false); + ASSERT_LT(last_sequence_read, 1025U); + + // Try to read past the gap, should be able to seek to key1025 + auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); + ExpectRecords(1, iter2); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + WriteBatch batch; + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(Put(1, "key4", DummyString(1024))); + auto iter = OpenTransactionLogIter(3); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + { + WriteBatch batch; + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + } + + auto res = OpenTransactionLogIter(0)->GetBatch(); + struct Handler : public WriteBatch::Handler { + std::string seen; + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); + } + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); + } + void LogData(const Slice& blob) override { + seen += "LogData(" + blob.ToString() + ")"; + } + Status DeleteCF(uint32_t cf, const Slice& key) override { + seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")"; + return Status::OK(); + } + } handler; + ASSERT_OK(res.writeBatchPtr->Iterate(&handler)); + ASSERT_EQ( + "Put(1, key1, 1024)" + "Put(0, key2, 1024)" + "LogData(blob1)" + "Put(1, key3, 1024)" + "LogData(blob2)" + "Delete(0, key2)", + handler.seen); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void)argc; + (void)argv; + return 0; +#endif 
+}
diff --git a/src/rocksdb/db/db_logical_block_size_cache_test.cc b/src/rocksdb/db/db_logical_block_size_cache_test.cc
new file mode 100644
index 000000000..13c16618e
--- /dev/null
+++ b/src/rocksdb/db/db_logical_block_size_cache_test.cc
@@ -0,0 +1,521 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef OS_LINUX
+#include "env/io_posix.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper {
+ public:
+  EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache)
+      : EnvWrapper(env), cache_(cache) {}
+
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return cache_->RefAndCacheLogicalBlockSize(paths);
+  }
+
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths);
+    return Status::OK();
+  }
+
+ private:
+  LogicalBlockSizeCache* cache_;
+};
+
+class DBLogicalBlockSizeCacheTest : public testing::Test {
+ public:
+  DBLogicalBlockSizeCacheTest()
+      : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")),
+        data_path_0_(dbname_ + "/data_path_0"),
+        data_path_1_(dbname_ + "/data_path_1"),
+        cf_path_0_(dbname_ + "/cf_path_0"),
+        cf_path_1_(dbname_ + "/cf_path_1") {
+    auto get_fd_block_size = [&](int fd) { return fd; };
+    auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) {
+      *size = 1024;
+      return Status::OK();
+    };
+    cache_.reset(
+        new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size));
+    env_.reset(
+        new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get()));
+  }
+
+ protected:
+  std::string dbname_;
+  std::string data_path_0_;
+  std::string data_path_1_;
+  std::string cf_path_0_;
+  std::string cf_path_1_;
+  std::unique_ptr<LogicalBlockSizeCache> cache_;
+  std::unique_ptr<Env> env_;
+};
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
+  // Tests that Open will cache the logical block size for data paths,
+  // and Close will remove the cached sizes.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+    }
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(data_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+    ASSERT_TRUE(cache_->Contains(data_path_1_));
+    ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+    ASSERT_OK(db->Close());
+    ASSERT_EQ(0, cache_->Size());
+    delete db;
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
+  // Tests that Open will cache the logical block size for data paths,
+  // and that deleting the db pointer will remove the cached sizes.
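+  // (Open registers the db/cf paths via Env::RegisterDbPaths, which refs and
+  // caches each directory's logical block size; closing or deleting the DB
+  // unregisters the paths, unrefing and eventually evicting the entries.)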
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+    }
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
+  // Tests that CreateColumnFamily will cache the cf_paths; dropping the
+  // column family handle won't drop the cache, while dropping and then
+  // deleting the column family handle will drop the cache.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  ColumnFamilyHandle* cf = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  // Dropping the column family does not drop the cache.
+  ASSERT_OK(db->DropColumnFamily(cf));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  // Deleting the handle will drop the cache.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  delete db;
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
+  // To test:
+  // (1) CreateColumnFamilies will cache the cf_paths in
+  // DBLogicalBlockSizeCache
+  // (2) Dropping the column family handles associated with
+  // the cf_paths won't drop the cached cf_paths
+  // (3) Deleting all the column family handles associated
+  // with the cf_paths will drop the cached cf_paths
+
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  std::vector<ColumnFamilyHandle*> cfs;
+  ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+  // Dropping a column family does not drop cf_path_0_'s entry from the cache
+  for (ColumnFamilyHandle* cf : cfs) {
+    ASSERT_OK(db->DropColumnFamily(cf));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+  }
+
+  // Deleting one cf handle will not drop cf_path_0_'s entry from the cache
+  // because another handle is still referencing cf_path_0_.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+
+  // Delete all cf handles and ensure the ref count of cf_path_0_ in cache_
+  // can be properly decreased by releasing any background reference to the
+  // ColumnFamilyData during db deletion
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  delete db;
+
+  // Now the ref count of cf_path_0_ in cache_ has been properly decreased,
+  // and cf_path_0_'s entry is dropped from the cache
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(
+      DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
+  // Tests that opening two column families with the same cf_path will cache
+  // the cf_path and hold 2 references to the cached size; dropping the column
+  // family handles won't drop the cache, while dropping and then deleting
+  // them will.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    ColumnFamilyHandle* cf1 = nullptr;
+    ColumnFamilyHandle* cf2 = nullptr;
+    ASSERT_OK(DB::Open(options, dbname_, &db));
+    ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
+    ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+
+    std::vector<ColumnFamilyHandle*> cfs;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_,
+                         {{"cf1", cf_options},
+                          {"cf2", cf_options},
+                          {"default", ColumnFamilyOptions()}},
+                         &cfs, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_,
+                                    {{"cf1", cf_options},
+                                     {"cf2", cf_options},
+                                     {"default", ColumnFamilyOptions()}},
+                                    &cfs, &db));
+#endif
+    }
+
+    // Logical block sizes of dbname_ and cf_path_0_ are cached during Open.
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+    // Dropping the handles won't drop the cache.
+    ASSERT_OK(db->DropColumnFamily(cfs[0]));
+    ASSERT_OK(db->DropColumnFamily(cfs[1]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+    // Deleting the 1st handle won't drop the cache for cf_path_0_.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    // Deleting the 2nd handle will drop the cache for cf_path_0_.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+    // Deleting the default handle won't affect the cache because the db still
+    // refers to the default CF.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2]));
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(
+      DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
+  // Tests that destroying a column family handle without dropping the column
+  // family won't drop the cache, because compaction and flush might still
+  // need to get the logical block size when opening new files.
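+  // (DropColumnFamily only marks the column family as dropped; the handle,
+  // and any background work still referencing the ColumnFamilyData, keeps the
+  // cached size alive until the last reference goes away.)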
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ColumnFamilyHandle* cf = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  // Deleting the handle won't drop the cache.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  delete db;
+  ASSERT_EQ(0, cache_->Size());
+
+  // Open with column families.
+  std::vector<ColumnFamilyHandle*> cfs;
+  for (int i = 0; i < 2; i++) {
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(
+          options, dbname_,
+          {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(
+          options, dbname_,
+          {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+#endif
+    }
+    // cf_path_0_ and dbname_ are cached.
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    // Deleting the handles won't drop the cache.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
+  // Tests the cache behavior when there are multiple DBs sharing the same env
+  // with different db_paths and cf_paths.
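+  // (Entries are keyed by path, not by DB: each registered path has one cache
+  // entry whose reference count tracks how many registrations are active, so
+  // two DBs registering the same path would share a single entry, as
+  // exercised in MultiDBWithSamePaths below.)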
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+  DB* db0;
+  ASSERT_OK(DB::Open(options, data_path_0_, &db0));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+
+  ColumnFamilyOptions cf_options0;
+  cf_options0.cf_paths = {{cf_path_0_, 1024}};
+  ColumnFamilyHandle* cf0;
+  ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  DB* db1;
+  ASSERT_OK(DB::Open(options, data_path_1_, &db1));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+
+  ColumnFamilyOptions cf_options1;
+  cf_options1.cf_paths = {{cf_path_1_, 1024}};
+  ColumnFamilyHandle* cf1;
+  ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1));
+  ASSERT_EQ(4, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+  delete db0;
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+  ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
+
+  ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+  delete db1;
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
+  // Tests the cache behavior when there are multiple DBs sharing the same env
+  // with the same db_paths and cf_paths.
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + options.db_paths = {{data_path_0_, 1024}}; + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + + DB* db0; + ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + + ColumnFamilyHandle* cf0; + ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + DB* db1; + ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + ColumnFamilyHandle* cf1; + ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); + delete db0; + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); + + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); + delete db1; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_LINUX + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc new file mode 100644 index 000000000..cae592db3 --- /dev/null +++ b/src/rocksdb/db/db_memtable_test.cc @@ -0,0 +1,344 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+  DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {}
+};
+
+class MockMemTableRep : public MemTableRep {
+ public:
+  explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+      : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+  KeyHandle Allocate(const size_t len, char** buf) override {
+    return rep_->Allocate(len, buf);
+  }
+
+  void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+  void InsertWithHint(KeyHandle handle, void** hint) override {
+    num_insert_with_hint_++;
+    EXPECT_NE(nullptr, hint);
+    last_hint_in_ = *hint;
+    rep_->InsertWithHint(handle, hint);
+    last_hint_out_ = *hint;
+  }
+
+  bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override {
+    rep_->Get(k, callback_args, callback_func);
+  }
+
+  size_t ApproximateMemoryUsage() override {
+    return rep_->ApproximateMemoryUsage();
+  }
+
+  Iterator* GetIterator(Arena* arena) override {
+    return rep_->GetIterator(arena);
+  }
+
+  void* last_hint_in() { return last_hint_in_; }
+  void* last_hint_out() { return last_hint_out_; }
+  int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+  std::unique_ptr<MemTableRep> rep_;
+  void* last_hint_in_;
+  void* last_hint_out_;
+  int num_insert_with_hint_;
+};
+
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger) override {
+    SkipListFactory factory;
+    MemTableRep* skiplist_rep =
+        factory.CreateMemTableRep(cmp, allocator, transform, logger);
+    mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+    return mock_rep_;
+  }
+
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger,
+                                 uint32_t column_family_id) override {
+    last_column_family_id_ = column_family_id;
+    return CreateMemTableRep(cmp, allocator, transform, logger);
+  }
+
+  const char* Name() const override { return "MockMemTableRepFactory"; }
+
+  MockMemTableRep* rep() { return mock_rep_; }
+
+  bool IsInsertConcurrentlySupported() const override { return false; }
+
+  uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+  MockMemTableRep* mock_rep_;
+  // workaround since there's no std::numeric_limits<uint32_t>::max() yet.
+  uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
+class TestPrefixExtractor : public SliceTransform {
+ public:
+  const char* Name() const override { return "TestPrefixExtractor"; }
+
+  Slice Transform(const Slice& key) const override {
+    const char* p = separator(key);
+    if (p == nullptr) {
+      return Slice();
+    }
+    return Slice(key.data(), p - key.data() + 1);
+  }
+
+  bool InDomain(const Slice& key) const override {
+    return separator(key) != nullptr;
+  }
+
+  bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+  const char* separator(const Slice& key) const {
+    return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+  }
+};
+
+// Test that ::Add properly returns false when inserting duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+  SequenceNumber seq = 123;
+  std::string value;
+  MergeContext merge_context;
+  Options options;
+  InternalKeyComparator ikey_cmp(options.comparator);
+  ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Write some keys and make sure it returns false on duplicates
+  ASSERT_OK(
+      mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */));
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  // Changing the type should still cause the duplicate key error
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  // Changing the seq number will make the key fresh
+  ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2",
+                     nullptr /* kv_prot_info */));
+  // Test with different types for duplicate keys
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+
+  // Test the duplicate keys under stress
+  for (int i = 0; i < 10000; i++) {
+    bool insert_dup = i % 10 == 1;
+    if (!insert_dup) {
+      seq++;
+    }
+    Status s = mem->Add(seq, kTypeValue, "foo", "value" + std::to_string(seq),
+                        nullptr /* kv_prot_info */);
+    if (insert_dup) {
+      ASSERT_TRUE(s.IsTryAgain());
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+  delete mem;
+
+  // Test with InsertWithHint
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());  // which uses _ to extract the prefix
+  ioptions = ImmutableOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  // Insert a duplicate key with _ in it
+  ASSERT_OK(
+      mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */));
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  delete mem;
+
+  // Test when InsertConcurrently will be invoked
+  options.allow_concurrent_memtable_write = true;
+  ioptions = ImmutableOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  MemTablePostProcessInfo post_process_info;
+  ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value",
+                     nullptr /* kv_prot_info */, true, &post_process_info));
+  ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value",
+                       nullptr /* kv_prot_info */, true, &post_process_info)
+                  .IsTryAgain());
+  delete mem;
+}
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+  int num_ops = 1000;
+  std::string value;
+  MergeContext merge_context;
+  Options options;
+  // A merge operator that is not sensitive to concurrent writes since in this
+  // test we don't order the writes.
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  options.allow_concurrent_memtable_write = true;
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Put 0 as the base
+  PutFixed64(&value, static_cast<uint64_t>(0));
+  ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */));
+  value.clear();
+
+  // Write Merge concurrently
+  ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+    MemTablePostProcessInfo post_process_info1;
+    std::string v1;
+    for (int seq = 1; seq < num_ops / 2; seq++) {
+      PutFixed64(&v1, seq);
+      ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1,
+                         nullptr /* kv_prot_info */, true,
+                         &post_process_info1));
+      v1.clear();
+    }
+  });
+  ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+    MemTablePostProcessInfo post_process_info2;
+    std::string v2;
+    for (int seq = num_ops / 2; seq < num_ops; seq++) {
+      PutFixed64(&v2, seq);
+      ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2,
+                         nullptr /* kv_prot_info */, true,
+                         &post_process_info2));
+      v2.clear();
+    }
+  });
+  write_thread1.join();
+  write_thread2.join();
+
+  Status status;
+  ReadOptions roptions;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  LookupKey lkey("key", kMaxSequenceNumber);
+  bool res = mem->Get(lkey, &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                      &status, &merge_context, &max_covering_tombstone_seq,
+                      roptions, false /* immutable_memtable */);
+  ASSERT_OK(status);
+  ASSERT_TRUE(res);
+  uint64_t ivalue = DecodeFixed64(Slice(value).data());
+  uint64_t sum = 0;
+  for (int seq = 0; seq < num_ops; seq++) {
+    sum += seq;
+  }
+  ASSERT_EQ(ivalue, sum);
+
+  delete mem;
+}
+
+TEST_F(DBMemTableTest, InsertWithHint) {
+  Options options;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());
+  options.env = env_;
+  Reopen(options);
+  MockMemTableRep* rep =
+      reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+          ->rep();
+  ASSERT_OK(Put("foo_k1", "foo_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_foo = rep->last_hint_out();
+  ASSERT_OK(Put("foo_k2", "foo_v2"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("foo_k3", "foo_v3"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("bar_k1", "bar_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_bar = rep->last_hint_out();
+  ASSERT_NE(hint_foo, hint_bar);
+  ASSERT_OK(Put("bar_k2", "bar_v2"));
+  ASSERT_EQ(hint_bar, rep->last_hint_in());
+  ASSERT_EQ(hint_bar, rep->last_hint_out());
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_OK(Put("NotInPrefixDomain", "vvv"));
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_EQ("foo_v1", Get("foo_k1"));
+  ASSERT_EQ("foo_v2", Get("foo_k2"));
+  ASSERT_EQ("foo_v3", Get("foo_k3"));
+  ASSERT_EQ("bar_v1", Get("bar_k1"));
+  ASSERT_EQ("bar_v2", Get("bar_k2"));
+  ASSERT_EQ("vvv", Get("NotInPrefixDomain"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+  // Verifies MemTableRepFactory is told the right column family id.
+  Options options;
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_EQ(cf,
+              static_cast<MockMemTableRepFactory*>(
+                  options.memtable_factory.get())
+                  ->GetLastColumnFamilyId());
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..cbec37138
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,448 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+  LimitedStringAppendMergeOp(int limit, char delim)
+      : StringAppendTESTOperator(delim), limit_(limit) {}
+
+  const char* Name() const override {
+    return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+  }
+
+  bool ShouldMerge(const std::vector<Slice>& operands) const override {
+    if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  size_t limit_ = 0;
+};
+}  // anonymous namespace
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+  DBMergeOperandTest()
+      : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBMergeOperandTest, CacheEvictedMergeOperandReadAfterFreeBug) {
+  // There was a bug where merge operands could be read after they were
+  // mistakenly freed in DB::GetMergeOperands, surfaced when the block cache
+  // was full. See PR#9507 for more.
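+  // The failure mode being guarded against (a sketch of the mechanism): the
+  // returned operands pointed into cached blocks, and with a tiny cache those
+  // blocks could be evicted and freed before the caller consumed the slices.
+  // After the fix, the returned PinnableSlices are expected to keep their
+  // backing data alive, so the reads below stay valid even with a 1-byte
+  // cache capacity.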
+ Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.env = env_; + BlockBasedTableOptions table_options; + + // Small cache to simulate cache full + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v4")); + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(number_of_operands, 4); + ASSERT_EQ(values[0].ToString(), "v1"); + ASSERT_EQ(values[1].ToString(), "v2"); + ASSERT_EQ(values[2].ToString(), "v3"); + ASSERT_EQ(values[3].ToString(), "v4"); +} + +TEST_F(DBMergeOperandTest, FlushedMergeOperandReadAfterFreeBug) { + // Repro for a bug where a memtable containing a merge operand could be + // deleted before the merge operand was saved to the result. + auto options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + ASSERT_OK(Merge("key", "value")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::GetImpl:PostMemTableGet:0", + "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush"}, + {"DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush", + "DBImpl::GetImpl:PostMemTableGet:1"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + auto flush_thread = port::Thread([&]() { + TEST_SYNC_POINT( + "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush"); + ASSERT_OK(Flush()); + TEST_SYNC_POINT( + "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush"); + }); + + PinnableSlice value; + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = 1; + int number_of_operands; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "key", &value, &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(1, number_of_operands); + + flush_thread.join(); +} + +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // k0 value in memtable + ASSERT_OK(Put("k0", "PutARock")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "PutARock"); + + // k0.1 value in SST + ASSERT_OK(Put("k0.1", "RockInSST")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0.1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "RockInSST"); + + // All k1 values are in memtable. 
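+  // Expected shape of the result (sketch): a Put terminates the operand
+  // list, so for the write sequence below,
+  //
+  //   Merge(a), Put(x), Merge(b), Merge(c), Merge(d)
+  //     => GetMergeOperands returns {"x", "b", "c", "d"}
+  //
+  // i.e. operands older than the Put are never surfaced.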
+ ASSERT_OK(Merge("k1", "a")); + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + ASSERT_OK(Delete("k1.1")); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1.1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + ASSERT_OK(Put("k2.1", "l")); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + ASSERT_OK(Delete("k2.2")); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. 
+ ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("k3.1", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. + ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("k3.2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable + // Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("k5", "remember")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k5", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1 values are in memtable. + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. 
+ merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Put("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q,w,e,r"); + + // Do some compaction that will make the following tests more predictable + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Put("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All K4 values are in different levels + ASSERT_OK(Put("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); +} + +TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) { + // These constants are chosen to trigger the large result optimization + // (pinning a bundle of `DBImpl` resources). + const int kNumOperands = 1024; + const int kOperandLen = 1024; + + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + Random rnd(301); + std::vector expected_merge_operands; + expected_merge_operands.reserve(kNumOperands); + for (int i = 0; i < kNumOperands; ++i) { + expected_merge_operands.emplace_back(rnd.RandomString(kOperandLen)); + ASSERT_OK(Merge("key", expected_merge_operands.back())); + } + + std::vector merge_operands(kNumOperands); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = kNumOperands; + int num_merge_operands = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "key", merge_operands.data(), + &merge_operands_info, &num_merge_operands)); + ASSERT_EQ(num_merge_operands, kNumOperands); + + // Ensures the large result optimization was used. + for (int i = 0; i < kNumOperands; ++i) { + ASSERT_TRUE(merge_operands[i].IsPinned()); + } + + // Add a Flush() to change the `SuperVersion` to challenge the resource + // pinning. 
+ ASSERT_OK(Flush()); + + for (int i = 0; i < kNumOperands; ++i) { + ASSERT_EQ(expected_merge_operands[i], merge_operands[i]); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc new file mode 100644 index 000000000..7c5505bd1 --- /dev/null +++ b/src/rocksdb/db/db_merge_operator_test.cc @@ -0,0 +1,669 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include +#include + +#include "db/db_test_util.h" +#include "db/forward_iterator.h" +#include "port/stack_trace.h" +#include "rocksdb/merge_operator.h" +#include "util/random.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace ROCKSDB_NAMESPACE { + +class TestReadCallback : public ReadCallback { + public: + TestReadCallback(SnapshotChecker* snapshot_checker, + SequenceNumber snapshot_seq) + : ReadCallback(snapshot_seq), + snapshot_checker_(snapshot_checker), + snapshot_seq_(snapshot_seq) {} + + bool IsVisibleFullCheck(SequenceNumber seq) override { + return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) == + SnapshotCheckerResult::kInSnapshot; + } + + private: + SnapshotChecker* snapshot_checker_; + SequenceNumber snapshot_seq_; +}; + +// Test merge operator functionality. +class DBMergeOperatorTest : public DBTestBase { + public: + DBMergeOperatorTest() + : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {} + + std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, + const Slice& key, + const Snapshot* snapshot = nullptr) { + SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber() + : snapshot->GetSequenceNumber(); + TestReadCallback read_callback(snapshot_checker, seq); + ReadOptions read_opt; + read_opt.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } +}; + +TEST_F(DBMergeOperatorTest, LimitMergeOperands) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + // All K1 values are in memtable. 
+ ASSERT_OK(Merge("k1", "a")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "k1", &value)); + // Make sure that only the latest two merge operands are used. If this was + // not the case the value would be "a,b,c,d". + ASSERT_EQ(value, "c,d"); + + // All K2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "a")); + ASSERT_OK(Merge("k2", "b")); + ASSERT_OK(Merge("k2", "c")); + ASSERT_OK(Merge("k2", "d")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); + ASSERT_EQ(value, "c,d"); + + // All K3 values are flushed and are in different files. + ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->Get(ReadOptions(), "k3", &value)); + ASSERT_EQ(value, "cd,de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ab")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "bc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "cd")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "de")); + ASSERT_OK(db_->Get(ReadOptions(), "k4", &value)); + ASSERT_EQ(value, "cd,de"); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnRead) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + Reopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "corrupted")); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption()); + VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}}); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.max_successive_merges = 3; + options.env = env_; + Reopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "v2")); + // Will trigger a merge when hitting max_successive_merges and the merge + // will fail. The delta will be inserted nevertheless. + ASSERT_OK(Merge("k1", "corrupted")); + // Data should stay unmerged after the error. 
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}}); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + + DestroyAndReopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "corrupted")); + ASSERT_OK(Put("k2", "v2")); + auto* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}}); + + DestroyAndReopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Merge("k2", "corrupted")); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}}); +} + +class MergeOperatorPinningTest : public DBMergeOperatorTest, + public testing::WithParamInterface { + public: + MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; // every block will contain one entry + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + const int kKeysPerFile = 10; + const int kOperandsPerKeyPerFile = 7; + const int kOperandSize = 100; + // Filse to write in L0 before compacting to lower level + const int kFilesPerLevel = 3; + + Random rnd(301); + std::map true_data; + int batch_num = 1; + int lvl_to_fill = 4; + int key_id = 0; + while (true) { + for (int j = 0; j < kKeysPerFile; j++) { + std::string key = Key(key_id % 35); + key_id++; + for (int k = 0; k < kOperandsPerKeyPerFile; k++) { + std::string val = rnd.RandomString(kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, val)); + if (true_data[key].size() == 0) { + true_data[key] = val; + } else { + true_data[key] += "," + val; + } + } + } + + if (lvl_to_fill == -1) { + // Keep last batch in memtable and stop + break; + } + + ASSERT_OK(Flush()); + if (batch_num % kFilesPerLevel == 0) { + if (lvl_to_fill != 0) { + MoveFilesToLevel(lvl_to_fill); + } + lvl_to_fill--; + } + batch_num++; + } + + // 3 L0 files + // 1 L1 file + // 3 L2 files + // 1 L3 file + // 3 L4 Files + ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3"); + + VerifyDBFromMap(true_data); +} + +class MergeOperatorHook : public MergeOperator { + public: + explicit MergeOperatorHook(std::shared_ptr _merge_op) + : merge_op_(_merge_op) {} + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const 
override { + before_merge_(); + bool res = merge_op_->FullMergeV2(merge_in, merge_out); + after_merge_(); + return res; + } + + const char* Name() const override { return merge_op_->Name(); } + + std::shared_ptr merge_op_; + std::function before_merge_ = []() {}; + std::function after_merge_ = []() {}; +}; + +TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { + Options options = CurrentOptions(); + + auto merge_hook = + std::make_shared(MergeOperators::CreateMaxOperator()); + options.merge_operator = merge_hook; + options.disable_auto_compactions = true; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.max_open_files = 20; + BlockBasedTableOptions bbto; + bbto.no_block_cache = disable_block_cache_; + if (bbto.no_block_cache == false) { + bbto.block_cache = NewLRUCache(64 * 1024 * 1024); + } else { + bbto.block_cache = nullptr; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const int kNumOperands = 30; + const int kNumKeys = 1000; + const int kOperandSize = 100; + Random rnd(301); + + // 1000 keys every key have 30 operands, every operand is in a different file + std::map true_data; + for (int i = 0; i < kNumOperands; i++) { + for (int j = 0; j < kNumKeys; j++) { + std::string k = Key(j); + std::string v = rnd.RandomString(kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), k, v)); + + true_data[k] = std::max(true_data[k], v); + } + ASSERT_OK(Flush()); + } + + std::vector file_numbers = ListTableFiles(env_, dbname_); + ASSERT_EQ(file_numbers.size(), kNumOperands); + int merge_cnt = 0; + + // Code executed before merge operation + merge_hook->before_merge_ = [&]() { + // Evict all tables from cache before every merge operation + auto* table_cache = dbfull()->TEST_table_cache(); + for (uint64_t num : file_numbers) { + TableCache::Evict(table_cache, num); + } + // Decrease cache capacity to force all unrefed blocks to be evicted + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(1); + } + merge_cnt++; + }; + + // Code executed after merge operation + merge_hook->after_merge_ = [&]() { + // Increase capacity again after doing the merge + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(64 * 1024 * 1024); + } + }; + + size_t total_reads; + VerifyDBFromMap(true_data, &total_reads); + ASSERT_EQ(merge_cnt, total_reads); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + VerifyDBFromMap(true_data, &total_reads); +} + +TEST_P(MergeOperatorPinningTest, TailingIterator) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const int kNumOperands = 100; + const int kNumWrites = 100000; + + std::function writer_func = [&]() { + int k = 0; + for (int i = 0; i < kNumWrites; i++) { + ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k))); + + if (i && i % kNumOperands == 0) { + k++; + } + if (i && i % 127 == 0) { + ASSERT_OK(Flush()); + } + if (i && i % 317 == 0) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + }; + + std::function reader_func = [&]() { + ReadOptions ro; + ro.tailing = true; + Iterator* iter = db_->NewIterator(ro); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + for (int i = 0; i < (kNumWrites / kNumOperands); i++) { + while (!iter->Valid()) { + // wait for the 
key to be written + env_->SleepForMicroseconds(100); + iter->Seek(Key(i)); + } + ASSERT_EQ(iter->key(), Key(i)); + ASSERT_EQ(iter->value(), Key(i)); + + iter->Next(); + } + ASSERT_OK(iter->status()); + + delete iter; + }; + + ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func); + ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func); + + writer_thread.join(); + reader_thread.join(); +} + +TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + // Overview of the test: + // * There are two merge operands for the same key: one in an sst file, + // another in a memtable. + // * Seek a tailing iterator to this key. + // * As part of the seek, the iterator will: + // (a) first visit the operand in the memtable and tell ForwardIterator + // to pin this operand, then + // (b) move on to the operand in the sst file, then pass both operands + // to merge operator. + // * The memtable may get flushed and unreferenced by another thread between + // (a) and (b). The test simulates it by flushing the memtable inside a + // SyncPoint callback located between (a) and (b). + // * In this case it's ForwardIterator's responsibility to keep the memtable + // pinned until (b) is complete. There used to be a bug causing + // ForwardIterator to not pin it in some circumstances. This test + // reproduces it. + + ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst")); + ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A + ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable")); + + // Pin SuperVersion A + std::unique_ptr someone_else(db_->NewIterator(ReadOptions())); + ASSERT_OK(someone_else->status()); + + bool pushed_first_operand = false; + bool stepped_to_next_operand = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { + EXPECT_FALSE(pushed_first_operand); + pushed_first_operand = true; + EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { + EXPECT_FALSE(stepped_to_next_operand); + stepped_to_next_operand = true; + someone_else.reset(); // Unpin SuperVersion A + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + ro.tailing = true; + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek("key"); + + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); + EXPECT_TRUE(pushed_first_operand); + EXPECT_TRUE(stepped_to_next_operand); +} +#endif // ROCKSDB_LITE + +TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + class TestSnapshotChecker : public SnapshotChecker { + public: + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + return IsInSnapshot(seq, snapshot_seq) + ? 
SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const { + switch (snapshot_seq) { + case 0: + return seq == 0; + case 1: + return seq <= 1; + case 2: + // seq = 2 not visible to snapshot with seq = 2 + return seq <= 1; + case 3: + return seq <= 3; + case 4: + // seq = 4 not visible to snpahost with seq = 4 + return seq <= 3; + default: + // seq >=4 is uncommitted + return seq <= 4; + }; + } + }; + TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker(); + dbfull()->SetSnapshotChecker(snapshot_checker); + + std::string value; + ASSERT_OK(Merge("foo", "v1")); + ASSERT_EQ(1, db_->GetLatestSequenceNumber()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + ASSERT_OK(Merge("foo", "v2")); + ASSERT_EQ(2, db_->GetLatestSequenceNumber()); + // v2 is not visible to latest snapshot, which has seq = 2. + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + // Take a snapshot with seq = 2. + const Snapshot* snapshot1 = db_->GetSnapshot(); + ASSERT_EQ(2, snapshot1->GetSequenceNumber()); + // v2 is not visible to snapshot1, which has seq = 2 + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + + // Verify flush doesn't alter the result. + ASSERT_OK(Flush()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + + ASSERT_OK(Merge("foo", "v3")); + ASSERT_EQ(3, db_->GetLatestSequenceNumber()); + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + ASSERT_OK(Merge("foo", "v4")); + ASSERT_EQ(4, db_->GetLatestSequenceNumber()); + // v4 is not visible to latest snapshot, which has seq = 4. + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + const Snapshot* snapshot2 = db_->GetSnapshot(); + ASSERT_EQ(4, snapshot2->GetSequenceNumber()); + // v4 is not visible to snapshot2, which has seq = 4. + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + + // Verify flush doesn't alter the result. + ASSERT_OK(Flush()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + + ASSERT_OK(Merge("foo", "v5")); + ASSERT_EQ(5, db_->GetLatestSequenceNumber()); + // v5 is uncommitted + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + // full manual compaction. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Verify compaction doesn't alter the result. 
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + db_->ReleaseSnapshot(snapshot1); + db_->ReleaseSnapshot(snapshot2); +} + +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = rnd.RandomString(kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = rnd.RandomString(kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = rnd.RandomString(kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc new file mode 100644 index 000000000..691081db9 --- /dev/null +++ b/src/rocksdb/db/db_options_test.cc @@ -0,0 +1,1219 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+  DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {}
+
+#ifndef ROCKSDB_LITE
+  std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+      const DBOptions& options) {
+    std::string options_str;
+    std::unordered_map<std::string, std::string> mutable_map;
+    ConfigOptions config_options(options);
+    config_options.delimiter = "; ";
+
+    EXPECT_OK(GetStringFromMutableDBOptions(
+        config_options, MutableDBOptions(options), &options_str));
+    EXPECT_OK(StringToMap(options_str, &mutable_map));
+
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+      const ColumnFamilyOptions& options) {
+    std::string options_str;
+    ConfigOptions config_options;
+    config_options.delimiter = "; ";
+
+    std::unordered_map<std::string, std::string> mutable_map;
+    EXPECT_OK(GetStringFromMutableCFOptions(
+        config_options, MutableCFOptions(options), &options_str));
+    EXPECT_OK(StringToMap(options_str, &mutable_map));
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string>
+  GetRandomizedMutableCFOptionsMap(Random* rnd) {
+    Options options = CurrentOptions();
+    options.env = env_;
+    ImmutableDBOptions db_options(options);
+    test::RandomInitCFOptions(&options, options, rnd);
+    auto sanitized_options = SanitizeOptions(db_options, options);
+    auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+    delete options.compaction_filter;
+    return opt_map;
+  }
+
+  std::unordered_map<std::string, std::string>
+  GetRandomizedMutableDBOptionsMap(Random* rnd) {
+    DBOptions db_options;
+    test::RandomInitDBOptions(&db_options, rnd);
+    auto sanitized_options = SanitizeOptions(dbname_, db_options);
+    return GetMutableDBOptionsMap(sanitized_options);
+  }
+#endif  // ROCKSDB_LITE
+};
+
+TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) {
+  Options options;
+  options.env = env_;
+  options.track_and_verify_wals_in_manifest = true;
+
+  ImmutableDBOptions db_options(options);
+  ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest);
+
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest);
+
+  Status s =
+      dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}});
+  ASSERT_FALSE(s.ok());
+}
+
+TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) {
+  Options options;
+  options.env = env_;
+  options.verify_sst_unique_id_in_manifest = true;
+
+  ImmutableDBOptions db_options(options);
+  ASSERT_TRUE(db_options.verify_sst_unique_id_in_manifest);
+
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetDBOptions().verify_sst_unique_id_in_manifest);
+
+  Status s =
+      dbfull()->SetDBOptions({{"verify_sst_unique_id_in_manifest", "false"}});
+  ASSERT_FALSE(s.ok());
+}
+
+// RocksDB Lite does not support dynamic options.
+#ifndef ROCKSDB_LITE + +TEST_F(DBOptionsTest, AvoidUpdatingOptions) { + Options options; + options.env = env_; + options.max_background_jobs = 4; + options.delayed_write_rate = 1024; + + Reopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + bool is_changed_stats = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) { + ASSERT_FALSE(is_changed_stats); // should only save options file once + is_changed_stats = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // helper function to check the status and reset after each check + auto is_changed = [&] { + bool ret = is_changed_stats; + is_changed_stats = false; + return ret; + }; + + // without changing the value, but it's sanitized to a different value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}})); + ASSERT_TRUE(is_changed()); + + // without changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}})); + ASSERT_FALSE(is_changed()); + + // changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_TRUE(is_changed()); + + // update again + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_FALSE(is_changed()); + + // without changing a default value + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}})); + ASSERT_FALSE(is_changed()); + + // now change + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}})); + ASSERT_TRUE(is_changed()); + + // multiple values without change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}})); + ASSERT_FALSE(is_changed()); + + // multiple values with change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}})); + ASSERT_TRUE(is_changed()); +} + +TEST_F(DBOptionsTest, GetLatestDBOptions) { + // GetOptions should be able to get latest option changed by SetOptions. + Options options; + options.create_if_missing = true; + options.env = env_; + Random rnd(228); + Reopen(options); + auto new_options = GetRandomizedMutableDBOptionsMap(&rnd); + ASSERT_OK(dbfull()->SetDBOptions(new_options)); + ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions())); +} + +TEST_F(DBOptionsTest, GetLatestCFOptions) { + // GetOptions should be able to get latest option changed by SetOptions. 
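+  // Round-trip being checked (sketch with a hypothetical option value;
+  // write_buffer_size is one of the mutable CF options):
+  //
+  //   SetOptions(handle, {{"write_buffer_size", "131072"}});
+  //   GetOptions(handle).write_buffer_size == 131072
+  //
+  // here done with a whole randomized mutable-option map per column family.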
+ Options options; + options.create_if_missing = true; + options.env = env_; + Random rnd(228); + Reopen(options); + CreateColumnFamilies({"foo"}, options); + ReopenWithColumnFamilies({"default", "foo"}, options); + auto options_default = GetRandomizedMutableCFOptionsMap(&rnd); + auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd); + ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default)); + ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo)); + ASSERT_EQ(options_default, + GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0]))); + ASSERT_EQ(options_foo, + GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); +} + +TEST_F(DBOptionsTest, SetMutableTableOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + options.blob_file_size = 16384; + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + bbto.block_size = 8192; + bbto.block_restart_interval = 7; + + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + Options c_opts = dbfull()->GetOptions(cfh); + + const auto* c_bbto = + c_opts.table_factory->GetOptions(); + ASSERT_NE(c_bbto, nullptr); + ASSERT_EQ(c_opts.blob_file_size, 16384); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"table_factory.block_restart_interval", "11"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Now set an option that is not mutable - options should not change + ASSERT_NOK( + dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that are not - options should not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.no_block_cache", "false"}, + {"table_factory.block_size", "8192"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that do not exist - options should not + // change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "8192"}, + {"table_factory.does_not_exist", "true"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Trying to change the table factory fails + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory", TableFactory::kPlainTableName()}})); + + // Set some on the table and some on the Column Family + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"blob_file_size", "32768"}, + {"table_factory.block_restart_interval", "13"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_opts.blob_file_size, 32768); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); + // Set some on the table and a bad one on the ColumnFamily - options should + // not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "1024"}, + {"no_such_option", "32768"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + 
+
+TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
+  class DummySkipListFactory : public SkipListFactory {
+   public:
+    static const char* kClassName() { return "DummySkipListFactory"; }
+    const char* Name() const override { return kClassName(); }
+    explicit DummySkipListFactory() : SkipListFactory(2) {}
+  };
+  {
+    // Verify the DummySkipList cannot be created
+    ConfigOptions config_options;
+    config_options.ignore_unsupported_options = false;
+    std::unique_ptr<MemTableRepFactory> factory;
+    ASSERT_NOK(MemTableRepFactory::CreateFromString(
+        config_options, DummySkipListFactory::kClassName(), &factory));
+  }
+  Options options;
+  options.create_if_missing = true;
+  // Try with fail_if_options_file_error=false/true to update the options
+  for (bool on_error : {false, true}) {
+    options.fail_if_options_file_error = on_error;
+    options.env = env_;
+    options.disable_auto_compactions = false;
+
+    options.memtable_factory.reset(new DummySkipListFactory());
+    Reopen(options);
+
+    ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+    ASSERT_OK(
+        dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+    ColumnFamilyDescriptor cfd;
+    ASSERT_OK(cfh->GetDescriptor(&cfd));
+    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+                 DummySkipListFactory::kClassName());
+    ColumnFamilyHandle* test = nullptr;
+    ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+    ASSERT_OK(test->GetDescriptor(&cfd));
+    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+                 DummySkipListFactory::kClassName());
+
+    ASSERT_OK(dbfull()->DropColumnFamily(test));
+    delete test;
+  }
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+  const size_t kValueSize = 1024 * 1024;  // 1MB
+  Options options;
+  options.create_if_missing = true;
+  options.bytes_per_sync = 1024 * 1024;
+  options.use_direct_reads = false;
+  options.write_buffer_size = 400 * kValueSize;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+  Reopen(options);
+  int counter = 0;
+  int low_bytes_per_sync = 0;
+  int i = 0;
+  const std::string kValue(kValueSize, 'v');
+  ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+  WriteOptions write_opts;
+  // should sync approximately 40MB/1MB ~= 40 times.
+  for (i = 0; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), kValue, write_opts));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  low_bytes_per_sync = counter;
+  ASSERT_GT(low_bytes_per_sync, 35);
+  ASSERT_LT(low_bytes_per_sync, 45);
+
+  counter = 0;
+  // 8388608 = 8 * 1024 * 1024
+  ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+  ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+  // should sync approximately 40MB*2/8MB ~= 10 times.
+  // data will be 40*2MB because of previous Puts too.
+  for (i = 0; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), kValue, write_opts));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GT(counter, 5);
+  ASSERT_LT(counter, 15);
+
+  // Redundant assert. But leaving it here just to get the point across that
+  // low_bytes_per_sync > counter.
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
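+
+// The expected RangeSync counts above follow directly from the workload: the
+// first pass compacts roughly 40 puts * 1MB = 40MB, so at bytes_per_sync=1MB
+// the writer issues about 40MB / 1MB ~= 40 range syncs (the test accepts
+// 36..44). The second pass compacts roughly 80MB (old plus new data), so at
+// 8MB per sync it issues about 80MB / 8MB ~= 10 range syncs (accepted 6..14).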
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+  const size_t kValueSize = 1024 * 1024 * 3;
+  Options options;
+  options.create_if_missing = true;
+  options.wal_bytes_per_sync = 512;
+  options.write_buffer_size = 100 * kValueSize;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  std::atomic_int counter{0};
+  int low_bytes_per_sync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::RangeSync:0",
+      [&](void* /*arg*/) { counter.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const std::string kValue(kValueSize, 'v');
+  int i = 0;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  // Do not flush. If we flush here, SwitchWAL will reuse old WAL file since
+  // it's empty and will not get the new wal_bytes_per_sync value.
+  low_bytes_per_sync = counter;
+  // 5242880 = 1024 * 1024 * 5
+  ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+  ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  counter = 0;
+  i = 0;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  ASSERT_GT(counter, 0);
+  ASSERT_GT(low_bytes_per_sync, 0);
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+  Options options;
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 1024 * 1024;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_manifest_file_size = 1;
+  options.env = env_;
+  int buffer_size = 1024 * 1024;
+  Reopen(options);
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+  std::atomic<int> match_cnt(0);
+  std::atomic<int> unmatch_cnt(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+        int value = static_cast<int>(reinterpret_cast<size_t>(arg));
+        if (value == buffer_size) {
+          match_cnt++;
+        } else {
+          unmatch_cnt++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  int i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", std::to_string(i)));
+    ASSERT_OK(Put("bar", std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+
+  ASSERT_OK(
+      dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+  buffer_size = 512 * 1024;
+  match_cnt = 0;
+  unmatch_cnt = 0;  // SetDBOptions() will create a WritableFileWriter
+
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+  i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", std::to_string(i)));
+    ASSERT_OK(Put("bar", std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+}
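+
+// The "WritableFileWriter::WritableFileWriter:0" callback above receives the
+// writer's buffer size encoded in the pointer value itself rather than
+// behind it, hence the double cast. Spelled out (illustrative only):
+//
+//   size_t raw = reinterpret_cast<size_t>(arg);  // pointer bits as integer
+//   int value = static_cast<int>(raw);           // narrowed for comparison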
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+  Random rnd(1044);
+  auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+  ASSERT_OK(dbfull()->SetOptions(rand_opts));
+  // Verify if DB can be reopened after setting options.
+  Options options;
+  options.env = env_;
+  ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+  const std::string kValue(1024, 'v');
+  for (int method_type = 0; method_type < 2; method_type++) {
+    for (int option_type = 0; option_type < 4; option_type++) {
+      Options options;
+      options.create_if_missing = true;
+      options.disable_auto_compactions = true;
+      options.write_buffer_size = 1024 * 1024 * 10;
+      options.compression = CompressionType::kNoCompression;
+      options.level0_file_num_compaction_trigger = 1;
+      options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+      options.level0_slowdown_writes_trigger =
+          std::numeric_limits<int>::max();
+      options.hard_pending_compaction_bytes_limit =
+          std::numeric_limits<uint64_t>::max();
+      options.soft_pending_compaction_bytes_limit =
+          std::numeric_limits<uint64_t>::max();
+      options.env = env_;
+
+      DestroyAndReopen(options);
+      int i = 0;
+      for (; i < 1024; i++) {
+        ASSERT_OK(Put(Key(i), kValue));
+      }
+      ASSERT_OK(Flush());
+      for (; i < 1024 * 2; i++) {
+        ASSERT_OK(Put(Key(i), kValue));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      ASSERT_EQ(2, NumTableFilesAtLevel(0));
+      uint64_t l0_size = SizeAtLevel(0);
+
+      switch (option_type) {
+        case 0:
+          // test with level0_stop_writes_trigger
+          options.level0_stop_writes_trigger = 2;
+          options.level0_slowdown_writes_trigger = 2;
+          break;
+        case 1:
+          options.level0_slowdown_writes_trigger = 2;
+          break;
+        case 2:
+          options.hard_pending_compaction_bytes_limit = l0_size;
+          options.soft_pending_compaction_bytes_limit = l0_size;
+          break;
+        case 3:
+          options.soft_pending_compaction_bytes_limit = l0_size;
+          break;
+      }
+      Reopen(options);
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+      SyncPoint::GetInstance()->LoadDependency(
+          {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+            "BackgroundCallCompaction:0"},
+           {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+            "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+           {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+            "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+      // Block background compaction.
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      switch (method_type) {
+        case 0:
+          ASSERT_OK(
+              dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+          break;
+        case 1:
+          ASSERT_OK(dbfull()->EnableAutoCompaction(
+              {dbfull()->DefaultColumnFamily()}));
+          break;
+      }
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+      // Wait for the stall conditions to be recalculated.
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+      switch (option_type) {
+        case 0:
+          ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+          break;
+        case 1:
+          ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+          ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+          break;
+        case 2:
+          ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+          break;
+        case 3:
+          ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+          ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+          break;
+      }
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+      // Background compaction executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + } + } +} + +TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) { + Options options; + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 1000; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; i++) { + // Need to insert two keys to avoid trivial move. + ASSERT_OK(Put("foo", std::to_string(i))); + ASSERT_OK(Put("bar", std::to_string(i))); + ASSERT_OK(Flush()); + } + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel()); +} + +TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) { + Options options; + options.create_if_missing = true; + options.max_background_compactions = 1; // default value + options.env = env_; + Reopen(options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}})); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); +} + +TEST_F(DBOptionsTest, SetBackgroundFlushThreads) { + Options options; + options.create_if_missing = true; + options.max_background_flushes = 1; + options.env = env_; + Reopen(options); + ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}})); + ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed()); +} + +TEST_F(DBOptionsTest, SetBackgroundJobs) { + Options options; + options.create_if_missing = true; + options.max_background_jobs = 8; + options.env = env_; + Reopen(options); + + for (int i = 0; i < 2; ++i) { + if (i > 0) { + options.max_background_jobs = 12; + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_background_jobs", + std::to_string(options.max_background_jobs)}})); + } + + const int expected_max_flushes = options.max_background_jobs / 4; + + ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + + const int expected_max_compactions = 3 * expected_max_flushes; + + ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed()); + + ASSERT_EQ(expected_max_flushes, + env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_EQ(expected_max_compactions, + env_->GetBackgroundThreads(Env::Priority::LOW)); + } +} + +TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + WriteOptions write_without_wal; + write_without_wal.disableWAL = true; + + ASSERT_FALSE(options.avoid_flush_during_shutdown); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v1", write_without_wal)); + Reopen(options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("1", FilesPerLevel()); + + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v2", write_without_wal)); + ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}})); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + 
ASSERT_EQ("", FilesPerLevel()); +} + +TEST_F(DBOptionsTest, SetDelayedWriteRateOption) { + Options options; + options.create_if_missing = true; + options.delayed_write_rate = 2 * 1024U * 1024U; + options.env = env_; + Reopen(options); + ASSERT_EQ(2 * 1024U * 1024U, + dbfull()->TEST_write_controler().max_delayed_write_rate()); + + ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}})); + ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate()); +} + +TEST_F(DBOptionsTest, MaxTotalWalSizeChange) { + Random rnd(1044); + const auto value_size = size_t(1024); + std::string value = rnd.RandomString(value_size); + + Options options; + options.create_if_missing = true; + options.env = env_; + CreateColumnFamilies({"1", "2", "3"}, options); + ReopenWithColumnFamilies({"default", "1", "2", "3"}, options); + + WriteOptions write_options; + + const int key_count = 100; + for (int i = 0; i < key_count; ++i) { + for (size_t cf = 0; cf < handles_.size(); ++cf) { + ASSERT_OK(Put(static_cast(cf), Key(i), value)); + } + } + ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}})); + + for (size_t cf = 0; cf < handles_.size(); ++cf) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + ASSERT_EQ("1", FilesPerLevel(static_cast(cf))); + } +} + +TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + options.env = env_; + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + + for (int i = 0; i < 20; i++) { + unsigned int num = rand() % 5000 + 1; + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_dump_period_sec", std::to_string(num)}})); + ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec); + } + Close(); +} + +TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.env = env_; + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}})); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}})); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); +} + +static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { + dbfull->TEST_LockMutex(); + JobContext job_context(0); + dbfull->FindObsoleteFiles(&job_context, false); + ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty()); + dbfull->TEST_UnlockMutex(); + if (job_context.HaveSomethingToDelete()) { + // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles + // afterwards; otherwise the test may hang on shutdown + dbfull->PurgeObsoleteFiles(job_context); + } + job_context.Clean(); +} + +TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) { + Options options; + options.env = env_; + SetTimeElapseOnlySleepOnReopen(&options); + options.create_if_missing = true; + ASSERT_OK(TryReopen(options)); + + // Verify that candidate files set is empty when no full scan requested. 
+ assert_candidate_files_empty(dbfull(), true); + + ASSERT_OK( + dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}})); + + // After delete_obsolete_files_period_micros updated to 0, the next call + // to FindObsoleteFiles should make a full scan + assert_candidate_files_empty(dbfull(), false); + + ASSERT_OK( + dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}})); + + assert_candidate_files_empty(dbfull(), true); + + env_->MockSleepForMicroseconds(20); + assert_candidate_files_empty(dbfull(), true); + + env_->MockSleepForMicroseconds(1); + assert_candidate_files_empty(dbfull(), false); + + Close(); +} + +TEST_F(DBOptionsTest, MaxOpenFilesChange) { + SpecialEnv env(env_); + Options options; + options.env = CurrentOptions().env; + options.max_open_files = -1; + + Reopen(options); + + Cache* tc = dbfull()->TEST_table_cache(); + + ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files); + ASSERT_LT(2000, tc->GetCapacity()); + ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}})); + ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files); + // examine the table cache (actual size should be 1014) + ASSERT_GT(1500, tc->GetCapacity()); + Close(); +} + +TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { + Options options; + options.env = CurrentOptions().env; + options.delayed_write_rate = 0; + Reopen(options); + ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); + + options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024)); + Reopen(options); + ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); +} + +TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { + Options options; + options.env = CurrentOptions().env; + options.compaction_style = kCompactionStyleUniversal; + + options.ttl = 0; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 0; + options.periodic_compaction_seconds = 100; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 500; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBOptionsTest, SanitizeTtlDefault) { + Options options; + options.env = CurrentOptions().env; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.compaction_style = kCompactionStyleLevel; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); +} + +TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.env = CurrentOptions().env; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + + options.ttl = 100 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 200; + options.periodic_compaction_seconds = 300; + 
Reopen(options);
+  ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+  options.ttl = 500;
+  options.periodic_compaction_seconds = 300;
+  Reopen(options);
+  ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.arena_block_size = 4096;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.compaction_options_fifo.allow_compaction = false;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // Test dynamically changing ttl.
+  options.ttl = 1 * 60 * 60;  // 1 hour
+  ASSERT_OK(TryReopen(options));
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  env_->MockSleepForSeconds(61);
+
+  // No files should be compacted as ttl is set to 1 hour.
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set ttl to 1 minute. So all files should get deleted.
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // Test dynamically changing compaction_options_fifo.max_table_files_size
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+  options.ttl = 0;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // No files should be compacted as max_table_files_size is set to 500 KB.
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            500 << 10);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            12 << 10);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Test dynamically changing compaction_options_fifo.allow_compaction
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+  options.ttl = 0;
+  options.compaction_options_fifo.allow_compaction = false;
+  options.level0_file_num_compaction_trigger = 6;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // No files should be compacted as max_table_files_size is set to 500 KB and + // allow_compaction is false + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set allow_compaction to true. So number of files should be between 1 and 5. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{allow_compaction=true;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_GE(NumTableFilesAtLevel(0), 1); + ASSERT_LE(NumTableFilesAtLevel(0), 5); +} + +TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { + SpecialEnv env(env_); + Options options; + options.env = &env; + + options.compaction_readahead_size = 0; + options.level0_file_num_compaction_trigger = 2; + const std::string kValue(1024, 'v'); + Reopen(options); + + ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size); + ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); + ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); + for (int i = 0; i < 1024; i++) { + ASSERT_OK(Put(Key(i), kValue)); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 1024 * 2; i++) { + ASSERT_OK(Put(Key(i), kValue)); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(256, env_->compaction_readahead_size_); + Close(); +} + +TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.create_if_missing = true; + options.env = CurrentOptions().env; + + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // In release 6.0, ttl was promoted from a secondary level option under + // compaction_options_fifo to a top level option under ColumnFamilyOptions. + // We still need to handle old SetOptions calls but should ignore + // ttl under compaction_options_fifo. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}, + {"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + + // Put ttl as the first option inside compaction_options_fifo. That works as + // it doesn't overwrite any other option. 
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+       {"ttl", "191"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+TEST_F(DBOptionsTest, ChangeCompression) {
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+  Options options;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  options.compression = CompressionType::kLZ4Compression;
+  options.bottommost_compression = CompressionType::kNoCompression;
+  options.bottommost_compression_opts.level = 2;
+  options.bottommost_compression_opts.parallel_threads = 1;
+  options.env = CurrentOptions().env;
+
+  ASSERT_OK(TryReopen(options));
+
+  CompressionType compression_used = CompressionType::kLZ4Compression;
+  CompressionOptions compression_opt_used;
+  bool compacted = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* c = reinterpret_cast<Compaction*>(arg);
+        compression_used = c->output_compression();
+        compression_opt_used = c->output_compression_opts();
+        compacted = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kNoCompression, compression_used);
+  ASSERT_EQ(options.compression_opts.level, compression_opt_used.level);
+  ASSERT_EQ(options.compression_opts.parallel_threads,
+            compression_opt_used.parallel_threads);
+
+  compression_used = CompressionType::kLZ4Compression;
+  compacted = false;
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"bottommost_compression", "kSnappyCompression"},
+       {"bottommost_compression_opts", "0:6:0:0:4:true"}}));
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kSnappyCompression, compression_used);
+  ASSERT_EQ(6, compression_opt_used.level);
+  // Right now parallel_threads is not yet allowed to be changed.
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // ROCKSDB_LITE
+
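+// For reference, the colon-separated CompressionOptions string used above
+// ("0:6:0:0:4:true") packs the struct fields positionally; the first four are
+// window_bits:level:strategy:max_dict_bytes (hence the level == 6 assertion),
+// and the trailing fields fill further members, ending here with
+// enabled=true. A caller-side sketch of the same runtime switch (illustrative
+// only):
+//
+//   db->SetOptions(db->DefaultColumnFamily(),
+//                  {{"bottommost_compression", "kSnappyCompression"},
+//                   {"bottommost_compression_opts", "0:6:0:0:4:true"}});
+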
+TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) {
+  // Verify the bottommost compression options still take effect even when the
+  // bottommost compression type is left at its default value. Verify for both
+  // automatic and manual compaction.
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+
+  constexpr int kUpperCompressionLevel = 1;
+  constexpr int kBottommostCompressionLevel = 2;
+  constexpr int kNumL0Files = 2;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.compression = CompressionType::kLZ4Compression;
+  options.compression_opts.level = kUpperCompressionLevel;
+  options.bottommost_compression_opts.level = kBottommostCompressionLevel;
+  options.bottommost_compression_opts.enabled = true;
+  Reopen(options);
+
+  CompressionType compression_used =
+      CompressionType::kDisableCompressionOption;
+  CompressionOptions compression_opt_used;
+  bool compacted = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) {
+        Compaction* c = static_cast<Compaction*>(arg);
+        compression_used = c->output_compression();
+        compression_opt_used = c->output_compression_opts();
+        compacted = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // First, verify for automatic compaction.
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put("foo", "foofoofoo"));
+    ASSERT_OK(Put("bar", "foofoofoo"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+  ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+
+  // Second, verify for manual compaction.
+  compacted = false;
+  compression_used = CompressionType::kDisableCompressionOption;
+  compression_opt_used = CompressionOptions();
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+  ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..85cd5c04e
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,2206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "options/cf_options.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "test_util/mock_time_env.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+  DBPropertiesTest()
+      : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {}
+
+  void AssertDbStats(const std::map<std::string, std::string>& db_stats,
+                     double expected_uptime, int expected_user_bytes_written,
+                     int expected_wal_bytes_written,
+                     int expected_user_writes_by_self,
+                     int expected_user_writes_with_wal) {
+    ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime"));
+    ASSERT_EQ(std::to_string(expected_wal_bytes_written),
+              db_stats.at("db.wal_bytes_written"));
+    ASSERT_EQ("0", db_stats.at("db.wal_syncs"));
+    ASSERT_EQ(std::to_string(expected_user_bytes_written),
+              db_stats.at("db.user_bytes_written"));
+    ASSERT_EQ("0", db_stats.at("db.user_writes_by_other"));
+    ASSERT_EQ(std::to_string(expected_user_writes_by_self),
+              db_stats.at("db.user_writes_by_self"));
+    ASSERT_EQ(std::to_string(expected_user_writes_with_wal),
+              db_stats.at("db.user_writes_with_wal"));
+    ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros"));
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options.allow_concurrent_memtable_write = false;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+    ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
+
+    ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger compaction
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->EnableFileDeletions(false));
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("1",
num); + } while (ChangeOptions()); +} + +TEST_F(DBPropertiesTest, CurrentVersionNumber) { + uint64_t v1, v2, v3; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); + ASSERT_OK(Put("12345678", "")); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); + ASSERT_OK(Flush()); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); +} + +TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { + const int kKeySize = 100; + const int kValueSize = 500; + const int kKeyNum = 100; + + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10; + // Make them never flush + options.min_write_buffer_number_to_merge = 1000; + options.max_write_buffer_number = 1000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"one", "two", "three", "four"}, options); + + Random rnd(301); + for (auto* handle : handles_) { + for (int i = 0; i < kKeyNum; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), handle, rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); + } + } + + uint64_t manual_sum = 0; + uint64_t api_sum = 0; + uint64_t value = 0; + for (auto* handle : handles_) { + ASSERT_TRUE( + db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value)); + manual_sum += value; + } + ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, + &api_sum)); + ASSERT_GT(manual_sum, 0); + ASSERT_EQ(manual_sum, api_sum); + + ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value)); + + uint64_t before_flush_trm; + uint64_t after_flush_trm; + for (auto* handle : handles_) { + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); + + // Issue flush and expect larger memory usage of table readers. + ASSERT_OK(db_->Flush(FlushOptions(), handle)); + + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); + ASSERT_GT(after_flush_trm, before_flush_trm); + } +} + +namespace { +void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; + tp->num_deletions = 0; + tp->num_merge_operands = 0; + tp->num_range_deletions = 0; +} + +void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 + " # merge operands %" SCNu64 " # range deletions %" SCNu64 + " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size (user-key? %" SCNu64 + ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, + &tp->index_size, &tp->filter_size); +} + +void VerifySimilar(uint64_t a, uint64_t b, double bias) { + ASSERT_EQ(a == 0U, b == 0U); + if (a == 0) { + return; + } + double dbl_a = static_cast(a); + double dbl_b = static_cast(b); + if (dbl_a > dbl_b) { + ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); + } else { + ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); + } +} + +void VerifyTableProperties( + const TableProperties& base_tp, const TableProperties& new_tp, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1, + double index_size_bias = 0.1, double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { + VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); + VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); + VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); + VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, + num_data_blocks_bias); + + ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); + ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); + ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); + ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions); + ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); + + // Merge operands may become Puts, so we only have an upper bound the exact + // number of merge operands. + ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands); +} + +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kPutsPerTable, const int kDeletionsPerTable, + const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { + const int kKeysPerTable = + kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable; + const int kPutCount = kTableCount * kPutsPerTable; + const int kDeletionCount = kTableCount * kDeletionsPerTable; + const int kMergeCount = kTableCount * kMergeOperandsPerTable; + const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; + const int kKeyCount = + kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; + const int kAvgSuccessorSize = kKeySize / 5; + const int kEncodingSavePerKey = kKeySize / 4; + expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); + expected_tp->raw_value_size = + (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize; + expected_tp->num_entries = kKeyCount; + expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount; + expected_tp->num_merge_operands = kMergeCount; + expected_tp->num_range_deletions = kRangeDeletionCount; + expected_tp->num_data_blocks = + kTableCount * + (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kBlockSize; + expected_tp->data_size = + kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); + expected_tp->index_size = + expected_tp->num_data_blocks * + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 
+
+void GetExpectedTableProperties(
+    TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+    const int kPutsPerTable, const int kDeletionsPerTable,
+    const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+    const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+    const bool index_key_is_user_key, const bool value_delta_encoding) {
+  const int kKeysPerTable =
+      kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+  const int kPutCount = kTableCount * kPutsPerTable;
+  const int kDeletionCount = kTableCount * kDeletionsPerTable;
+  const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+  const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+  const int kKeyCount =
+      kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+  const int kAvgSuccessorSize = kKeySize / 5;
+  const int kEncodingSavePerKey = kKeySize / 4;
+  expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+  expected_tp->raw_value_size =
+      (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+  expected_tp->num_entries = kKeyCount;
+  expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+  expected_tp->num_merge_operands = kMergeCount;
+  expected_tp->num_range_deletions = kRangeDeletionCount;
+  expected_tp->num_data_blocks =
+      kTableCount *
+      (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+      kBlockSize;
+  expected_tp->data_size =
+      kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+  expected_tp->index_size =
+      expected_tp->num_data_blocks *
+      (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+       // discount 1 byte as value size is not encoded in value delta encoding
+       (value_delta_encoding ? 1 : 0));
+  expected_tp->filter_size =
+      kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+                     /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+}  // anonymous namespace
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+  for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+    // If C++ gets a std::string_literal, this would be better to check at
+    // compile-time using static_assert.
+    ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+                !isdigit(ppt_name_and_info.first.back()));
+
+    int count = 0;
+    count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+    ASSERT_TRUE(count == 1);
+  }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+  // When "max_open_files" is -1, we read all the files for
+  // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+  // Otherwise, we sample 20 newest files to make an estimation.
+  // Formula: latest_20_files_active_key_ratio * total_files
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1000;
+  DestroyAndReopen(options);
+  int key = 0;
+  for (int files = 20; files >= 10; files -= 10) {
+    for (int i = 0; i < files; i++) {
+      int rows = files / 10;
+      for (int j = 0; j < rows; j++) {
+        ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo"));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+  }
+  std::string num;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("45", num);
+  options.max_open_files = -1;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("50", num);
+}
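+
+// The expected values above can be derived by hand: the loop writes 20 files
+// with 2 keys each plus 10 files with 1 key each, i.e. 50 keys in 30 files.
+// With sampling, the 20 newest files are the 10 one-key files plus 10 of the
+// two-key files, 30 keys in 20 files, so the estimate is
+// (30 keys / 20 files) * 30 files = 45; with max_open_files = -1 every file
+// is read and the exact 50 is reported.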
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+  for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+    const int kDeletionsPerTable = 0;
+    const int kMergeOperandsPerTable = 15;
+    const int kRangeDeletionsPerTable = 5;
+    const int kPutsPerTable = 100;
+    const int kKeySize = 80;
+    const int kValueSize = 200;
+    const int kBloomBitsPerKey = 20;
+
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 8;
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options.merge_operator.reset(new TestPutOperator());
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(
+        NewBloomFilterPolicy(kBloomBitsPerKey, false));
+    table_options.block_size = 1024;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+
+    // Hold open a snapshot to prevent range tombstones from being compacted
+    // away.
+    ManagedSnapshot snapshot(db_);
+
+    Random rnd(5632);
+    for (int table = 1; table <= kTableCount; ++table) {
+      for (int i = 0; i < kPutsPerTable; ++i) {
+        ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+                           rnd.RandomString(kValueSize)));
+      }
+      for (int i = 0; i < kDeletionsPerTable; i++) {
+        ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+      }
+      for (int i = 0; i < kMergeOperandsPerTable; i++) {
+        ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+                             rnd.RandomString(kValueSize)));
+      }
+      for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+        std::string start = rnd.RandomString(kKeySize);
+        std::string end = start;
+        end.resize(kValueSize);
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   start, end));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+    std::string property;
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+    TableProperties output_tp;
+    ParseTablePropertiesString(property, &output_tp);
+    bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+    bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+    TableProperties expected_tp;
+    GetExpectedTableProperties(
+        &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+        kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+        kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+        value_is_delta_encoded);
+
+    VerifyTableProperties(expected_tp, output_tp);
+  }
+}
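+
+// A note on the "+ 8" in GetExpectedTableProperties above: raw key sizes are
+// accounted on internal keys, and RocksDB appends an 8-byte footer (sequence
+// number and value type packed into one uint64_t) to every user key, so each
+// expected key contributes kKeySize + 8 bytes. For the constants used here
+// (kKeySize = 80, and 120 keys per table counting the five range tombstones),
+// that is 120 * 88 = 10560 bytes of raw key per table.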
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 6;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 4500 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_open_files = 11;  // Make sure no preloading of table readers
+
+  // RocksDB sanitizes max open files to at least 20. Modify it back.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int key_index = 0;
+  Random rnd(301);
+  for (int num = 0; num < 8; num++) {
+    ASSERT_OK(Put("foo", "bar"));
+    GenerateNewFile(&rnd, &key_index);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+  // Get() after flushes, see the latency histogram tracked.
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue Get(). See the latency tracked.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+
+  // Test for getting immutable_db_options_.statistics
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.options-statistics", &prop));
+  ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+  ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue iterating. See the latency tracked.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+    }
+    ASSERT_OK(iter->status());
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // CF 1 should show no histogram.
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  // Put something and read it back; CF 1 should show a histogram.
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // options.max_open_files = -1 preloads table readers.
+  options.max_open_files = -1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Clear internal stats
+  ASSERT_OK(dbfull()->ResetStats());
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+  const int kTableCount = 100;
+  const int kDeletionsPerTable = 0;
+  const int kMergeOperandsPerTable = 2;
+  const int kRangeDeletionsPerTable = 2;
+  const int kPutsPerTable = 10;
+  const int kKeySize = 50;
+  const int kValueSize = 400;
+  const int kMaxLevel = 7;
+  const int kBloomBitsPerKey = 20;
+  Random rnd(301);
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 8192;
+  options.max_bytes_for_level_base = 10000;
+  options.max_bytes_for_level_multiplier = 2;
+  // This ensures there is no compaction happening when we call GetProperty().
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new TestPutOperator());
+
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(
+      NewBloomFilterPolicy(kBloomBitsPerKey, false));
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Hold open a snapshot to prevent range tombstones from being compacted
+  // away.
+ ManagedSnapshot snapshot(db_); + + std::string level_tp_strings[kMaxLevel]; + std::string tp_string; + TableProperties level_tps[kMaxLevel]; + TableProperties tp, sum_tp, expected_tp; + for (int table = 1; table <= kTableCount; ++table) { + for (int i = 0; i < kPutsPerTable; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); + } + for (int i = 0; i < kDeletionsPerTable; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize))); + } + for (int i = 0; i < kMergeOperandsPerTable; i++) { + ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); + } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = rnd.RandomString(kKeySize); + std::string end = start; + end.resize(kValueSize); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + start, end)); + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ResetTableProperties(&sum_tp); + for (int level = 0; level < kMaxLevel; ++level) { + db_->GetProperty(DB::Properties::kAggregatedTablePropertiesAtLevel + + std::to_string(level), + &level_tp_strings[level]); + ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]); + sum_tp.data_size += level_tps[level].data_size; + sum_tp.index_size += level_tps[level].index_size; + sum_tp.filter_size += level_tps[level].filter_size; + sum_tp.raw_key_size += level_tps[level].raw_key_size; + sum_tp.raw_value_size += level_tps[level].raw_value_size; + sum_tp.num_data_blocks += level_tps[level].num_data_blocks; + sum_tp.num_entries += level_tps[level].num_entries; + sum_tp.num_deletions += level_tps[level].num_deletions; + sum_tp.num_merge_operands += level_tps[level].num_merge_operands; + sum_tp.num_range_deletions += level_tps[level].num_range_deletions; + } + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); + ParseTablePropertiesString(tp_string, &tp); + bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; + ASSERT_EQ(sum_tp.data_size, tp.data_size); + ASSERT_EQ(sum_tp.index_size, tp.index_size); + ASSERT_EQ(sum_tp.filter_size, tp.filter_size); + ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size); + ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); + ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); + ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions); + ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands); + ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); + if (table > 3) { + GetExpectedTableProperties( + &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable, + kMergeOperandsPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); + // Gives larger bias here as index block size, filter block size, + // and data block size become much harder to estimate in this test. + VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 
0.6 : 0.5,
+                             0.5, 0.5, 0.25);
+    }
+  }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.write_buffer_size = 1000000;
+    options.max_write_buffer_size_to_maintain =
+        5 * static_cast<int64_t>(options.write_buffer_size);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    std::string big_value(1000000 * 2, 'x');
+    std::string num;
+    uint64_t value;
+    SetPerfLevel(kEnableTime);
+    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k3");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(Flush(1));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "3");
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+    // "192" is the size of the metadata of two empty skiplists; this would
+    // break if we change the default skiplist implementation.
+    ASSERT_GE(value, 192);
+
+    uint64_t int_num;
+    uint64_t base_total_size;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
"k3", "")); + ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3")); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 3U); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 4U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, base_total_size + 1); + + SetPerfLevel(kDisable); + ASSERT_TRUE(GetPerfLevel() == kDisable); + } while (ChangeCompactOptions()); +} + +// TODO(techdept) : Disabled flaky test #12863555 +TEST_F(DBPropertiesTest, DISABLED_GetProperty) { + // Set sizes to both background thread pool to be 1 and block them. + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = 1; + options.compaction_options_universal.size_ratio = 50; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_size_to_maintain = 0; + options.write_buffer_size = 1000000; + Reopen(options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + uint64_t int_num; + SetPerfLevel(kEnableTime); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "1"); + get_perf_context()->Reset(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing")); + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + 
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "2");
+  // Verify the same set of properties through GetIntProperty
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+  ASSERT_EQ(int_num, 2U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+  ASSERT_EQ(int_num, 1U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_EQ(int_num, 2U);
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "4");
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Wait for compaction to be done. This is important because otherwise RocksDB
+  // might schedule a compaction when reopening the database, failing assertion
+  // (A) as a result.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  options.max_open_files = 10;
+  Reopen(options);
+  // After reopening, no table reader is loaded, so no memory for table readers
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);  // (A)
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // After reading a key, at least one table reader is loaded.
+  Get("k5");
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // Test rocksdb.num-live-versions
+  {
+    options.level0_file_num_compaction_trigger = 20;
+    Reopen(options);
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+    ASSERT_OK(Flush());
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+    ASSERT_OK(Flush());
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    iter2.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    iter1.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+  }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+  const int kNumRounds = 10;
+  // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+  // flushes happen.
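+  // Rough arithmetic for the phases below: each round issues kFlushesPerRound
+  // * kWritesPerFlush puts of ~(kKeySize + kValueSize) bytes, i.e. about
+  // 110 KB against a 1000-byte write buffer, so memtables flush throughout.
+  // An application would poll the same properties, e.g. (sketch):
+  //
+  //   uint64_t mem = 0;
+  //   if (db->GetIntProperty(DB::Properties::kSizeAllMemTables, &mem)) {
+  //     // mem covers the active memtable, unflushed immutable memtables,
+  //     // and immutable memtables pinned by iterators.
+  //   }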
+  const int kFlushesPerRound = 10;
+  const int kWritesPerFlush = 10;
+  const int kKeySize = 100;
+  const int kValueSize = 1000;
+  Options options;
+  options.write_buffer_size = 1000;  // small write buffer
+  options.min_write_buffer_number_to_merge = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::vector<Iterator*> iters;
+
+  uint64_t active_mem;
+  uint64_t unflushed_mem;
+  uint64_t all_mem;
+  uint64_t prev_all_mem;
+
+  // Phase 0. Verify that the initial values of all these properties are the
+  // same, as we have no mem-tables.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(all_mem, active_mem);
+  ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+  // "size-all-mem-tables"
+  for (int r = 0; r < kNumRounds; ++r) {
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        ASSERT_OK(
+            Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+      }
+    }
+    // Make sure that there is no flush between getting the two properties.
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // With no iterators, these two numbers should be the same.
+    ASSERT_EQ(unflushed_mem, all_mem);
+  }
+  prev_all_mem = all_mem;
+
+  // Phase 2. Keep issuing Put() but also create new iterators. This time we
+  // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    iters.push_back(db_->NewIterator(ReadOptions()));
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        ASSERT_OK(
+            Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+      }
+    }
+    // Force flush to prevent flush from happening between getting the
+    // properties or after getting the properties and before the new round.
+    ASSERT_OK(Flush());
+
+    // In the second round, add iterators.
+    dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    ASSERT_GT(all_mem, active_mem);
+    ASSERT_GT(all_mem, unflushed_mem);
+    ASSERT_GT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Phase 3. Delete iterators and expect "size-all-mem-tables" to shrink
+  // whenever we release an iterator.
+  for (auto* iter : iters) {
+    ASSERT_OK(iter->status());
+    delete iter;
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size shrinking
+    ASSERT_LT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Expect all these three counters to be the same.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+
+  // Phase 4. Reopen, and expect all these three counters to be the same again.
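+  // With no immutable memtables and no iterator-pinned history right after
+  // reopening, "cur-size-active-mem-table", "cur-size-all-mem-tables", and
+  // "size-all-mem-tables" should all reduce to the lone active memtable.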
+  Reopen(options);
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+}
+
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+  // Set the sizes of both background thread pools to 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
+
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+}
+
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNumL0Files = 3;
+  const int kNumEntriesPerFile = 1000;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 3;
+  Reopen(options);
+
+  ASSERT_OK(db_->SetOptions(
+      {{"compression_per_level", "kNoCompression:kSnappyCompression"}}));
+  auto opts = db_->GetOptions();
+  ASSERT_EQ(opts.compression_per_level.size(), 2);
+  ASSERT_EQ(opts.compression_per_level[0], kNoCompression);
+  ASSERT_EQ(opts.compression_per_level[1], kSnappyCompression);
+
+  // The compression ratio is -1.0 when there are no open files at the level.
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+  const std::string kVal(100, 'a');
+  for (int i = 0; i < kNumL0Files; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      // Put common data ("key") at end to prevent delta encoding from
+      // compressing the key effectively
+      std::string key = std::to_string(i) + std::to_string(j) + "key";
+      ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // No compression at L0, so the ratio is less than one.
+  ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+  ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+  ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+  // Data at L1 should be
highly compressed thanks to Snappy and redundant data + // in values (ratio is 12.846 as of 4/19/2016). + ASSERT_GT(CompressionRatioAtLevel(1), 10.0); +} + +#endif // ROCKSDB_LITE + +class CountingUserTblPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingUserTblPropCollector"; } + + Status Finish(UserCollectedProperties* properties) override { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = UserCollectedProperties{ + {"CountingUserTblPropCollector", message_}, + {"Count", encoded}, + }; + return Status::OK(); + } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + ++count_; + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + private: + std::string message_ = "Rocksdb"; + uint32_t count_ = 0; +}; + +class CountingUserTblPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + explicit CountingUserTblPropCollectorFactory( + uint32_t expected_column_family_id) + : expected_column_family_id_(expected_column_family_id), + num_created_(0) {} + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + EXPECT_EQ(expected_column_family_id_, context.column_family_id); + num_created_++; + return new CountingUserTblPropCollector(); + } + const char* Name() const override { + return "CountingUserTblPropCollectorFactory"; + } + void set_expected_column_family_id(uint32_t v) { + expected_column_family_id_ = v; + } + uint32_t expected_column_family_id_; + uint32_t num_created_; +}; + +class CountingDeleteTabPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingDeleteTabPropCollector"; } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType type, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + if (type == kEntryDelete) { + num_deletes_++; + } + return Status::OK(); + } + + bool NeedCompact() const override { return num_deletes_ > 10; } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + Status Finish(UserCollectedProperties* properties) override { + *properties = + UserCollectedProperties{{"num_delete", std::to_string(num_deletes_)}}; + return Status::OK(); + } + + private: + uint32_t num_deletes_ = 0; +}; + +class CountingDeleteTabPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new CountingDeleteTabPropCollector(); + } + const char* Name() const override { + return "CountingDeleteTabPropCollectorFactory"; + } +}; + +class BlockCountingTablePropertiesCollector : public TablePropertiesCollector { + public: + static const std::string kNumSampledBlocksPropertyName; + + const char* Name() const override { + return "BlockCountingTablePropertiesCollector"; + } + + Status Finish(UserCollectedProperties* properties) override { + (*properties)[kNumSampledBlocksPropertyName] = + std::to_string(num_sampled_blocks_); + return Status::OK(); + } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + void 
BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t block_compressed_bytes_fast,
+                uint64_t block_compressed_bytes_slow) override {
+    if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) {
+      num_sampled_blocks_++;
+    }
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{
+        {kNumSampledBlocksPropertyName, std::to_string(num_sampled_blocks_)},
+    };
+  }
+
+ private:
+  uint32_t num_sampled_blocks_ = 0;
+};
+
+const std::string
+    BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName =
+        "NumSampledBlocks";
+
+class BlockCountingTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  const char* Name() const override {
+    return "BlockCountingTablePropertiesCollectorFactory";
+  }
+
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /* context */) override {
+    return new BlockCountingTablePropertiesCollector();
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(0);
+  options.table_properties_collector_factories[0] = collector_factory;
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(4U, props.size());
+  uint32_t sum = 0;
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+    Slice key(user_collected.at("Count"));
+    uint32_t count;
+    ASSERT_TRUE(GetVarint32(&key, &count));
+    sum += count;
+  }
+  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+  ASSERT_GT(collector_factory->num_created_, 0U);
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 3;
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Create 2 files
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush(1));
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
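+  // level0_file_num_compaction_trigger is 3 here, so the three flushes below
+  // reach the trigger; the compaction's output files must then pass through
+  // the collector with the expected column family id as well.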
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  // Come back to write to default column family
+  collector_factory->num_created_ = 0;
+  collector_factory->set_expected_column_family_id(0);  // default CF
+  // Create 2 tables in the default column family
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  std::shared_ptr<CountingDeleteTabPropCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 1000;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 1) {
+    // Clear Level 0 so that when we later flush a file with deletions,
+    // we don't trigger an organic compaction.
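+    // Flushing one more small file reaches level0_file_num_compaction_trigger
+    // (2), so the existing L0 file is compacted away before the
+    // deletion-heavy flush below.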
+ ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(kMaxKey * 2), "")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + { + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + ++c; + } + ASSERT_OK(iter->status()); + ASSERT_EQ(c, 200); + } + + ASSERT_OK(Delete(Key(0))); + for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Delete(Key(kMaxKey * 2))); + + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + { + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + } + ASSERT_OK(iter->status()); + ASSERT_EQ(c, 0); + ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u); + ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u); + SetPerfLevel(kDisable); + } +} + +TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 10; + options.level0_slowdown_writes_trigger = 10; + options.level0_stop_writes_trigger = 10; + options.disable_auto_compactions = true; + options.env = env_; + + std::shared_ptr collector_factory = + std::make_shared(); + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + + DestroyAndReopen(options); + + const int kMaxKey = 100; + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), "")); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + for (int i = 1; i < kMaxKey - 1; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + + // Restart the DB. Although number of files didn't reach + // options.level0_file_num_compaction_trigger, compaction should + // still be triggered because of the need-compaction hint. + options.disable_auto_compactions = false; + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + { + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { + c++; + } + ASSERT_OK(iter->status()); + ASSERT_EQ(c, 2); + ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0); + // We iterate every key twice. Is it a bug? + ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2); + SetPerfLevel(kDisable); + } +} + +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { + // Sampled compression requires at least one of the following four types. 
+  if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
+      !ZSTD_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.table_properties_collector_factories.emplace_back(
+      std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+
+  for (bool sample_for_compression : {false, true}) {
+    // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
+    options.sample_for_compression = sample_for_compression ? 1 : 0;
+
+    DestroyAndReopen(options);
+
+    // Setup the following LSM:
+    //
+    // L0_0 ["a", "b"]
+    // L1_0 ["a", "b"]
+    //
+    // L0_0 was created by flush. L1_0 was created by compaction. Each file
+    // contains one data block.
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put("a", "val"));
+      ASSERT_OK(Put("b", "val"));
+      ASSERT_OK(Flush());
+      if (i == 1) {
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      }
+    }
+
+    // A `BlockAdd()` should have been seen for files generated by flush or
+    // compaction when `sample_for_compression` is enabled.
+    TablePropertiesCollection file_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+    ASSERT_EQ(2, file_to_props.size());
+    for (const auto& file_and_props : file_to_props) {
+      auto& user_props = file_and_props.second->user_collected_properties;
+      ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
+                                      kNumSampledBlocksPropertyName) !=
+                  user_props.end());
+      ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
+                                  kNumSampledBlocksPropertyName),
+                std::to_string(sample_for_compression ? 1 : 0));
+    }
+  }
+}
+
+class CompressionSamplingDBPropertiesTest
+    : public DBPropertiesTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {}
+
+ protected:
+  const bool fast_;
+};
+
+INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest,
+                        CompressionSamplingDBPropertiesTest,
+                        ::testing::Bool());
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_P(CompressionSamplingDBPropertiesTest,
+       EstimateDataSizeWithCompressionSampling) {
+  Options options = CurrentOptions();
+  if (fast_) {
+    // One of the following light compression libraries must be present.
+    if (LZ4_Supported()) {
+      options.compression = kLZ4Compression;
+    } else if (Snappy_Supported()) {
+      options.compression = kSnappyCompression;
+    } else {
+      return;
+    }
+  } else {
+    // One of the following heavy compression libraries must be present.
+    if (ZSTD_Supported()) {
+      options.compression = kZSTD;
+    } else if (Zlib_Supported()) {
+      options.compression = kZlibCompression;
+    } else {
+      return;
+    }
+  }
+  options.disable_auto_compactions = true;
+  // For simplicity/determinism, sample 100%.
+  options.sample_for_compression = 1;
+  Reopen(options);
+
+  // Setup the following LSM:
+  //
+  // L0_0 ["a", "b"]
+  // L1_0 ["a", "b"]
+  //
+  // L0_0 was created by flush. L1_0 was created by compaction. Each file
+  // contains one data block. The value consists of compressible data so the
+  // data block should be stored compressed.
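+  // Since sampling is at 100% and the values are long runs of 'a', every block
+  // should be sampled and compress well, so the estimated data size for the
+  // configured algorithm's family is expected to match the stored data_size.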
+  std::string val(1024, 'a');
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("a", val));
+    ASSERT_OK(Put("b", val));
+    ASSERT_OK(Flush());
+    if (i == 1) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+  }
+
+  TablePropertiesCollection file_to_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+  ASSERT_EQ(2, file_to_props.size());
+  for (const auto& file_and_props : file_to_props) {
+    ASSERT_GT(file_and_props.second->data_size, 0);
+    if (fast_) {
+      ASSERT_EQ(file_and_props.second->data_size,
+                file_and_props.second->fast_compression_estimated_data_size);
+    } else {
+      ASSERT_EQ(file_and_props.second->data_size,
+                file_and_props.second->slow_compression_estimated_data_size);
+    }
+  }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Delete("foo"));
+  ASSERT_OK(Delete("foo"));
+  uint64_t num_keys = 0;
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+  ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+  uint64_t oldest_key_time = 0;
+  Options options = CurrentOptions();
+  SetTimeElapseOnlySleepOnReopen(&options);
+
+  // "rocksdb.estimate-oldest-key-time" is only available with FIFO compaction.
+  for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                          kCompactionStyleNone}) {
+    options.compaction_style = compaction;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_FALSE(dbfull()->GetIntProperty(
+        DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+  }
+
+  int64_t mock_start_time;
+  ASSERT_OK(env_->GetCurrentTime(&mock_start_time));
+
+  options.compaction_style = kCompactionStyleFIFO;
+  options.ttl = 300;
+  options.max_open_files = -1;
+  options.compaction_options_fifo.allow_compaction = false;
+  DestroyAndReopen(options);
+
+  env_->MockSleepForSeconds(100);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+  ASSERT_OK(Flush());
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 200
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 300
+  ASSERT_OK(Put("k3", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(150);  // -> 450
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(200, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 550
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(300, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 650
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel());
+  ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                        &oldest_key_time));
+}
+
+TEST_F(DBPropertiesTest, SstFilesSize) {
+  struct TestListener : public EventListener {
+    void OnCompactionCompleted(DB* db,
+                               const CompactionJobInfo& /*info*/) override {
+      assert(callback_triggered == false);
+      assert(size_before_compaction > 0);
+      callback_triggered = true;
+      uint64_t total_sst_size = 0;
+      uint64_t live_sst_size = 0;
+      bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+                                   &total_sst_size);
+      ASSERT_TRUE(ok);
+      // total_sst_size includes files before and after compaction.
+      ASSERT_GT(total_sst_size, size_before_compaction);
+      ok =
+          db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+      ASSERT_TRUE(ok);
+      // live_sst_size only includes files after compaction.
+      ASSERT_GT(live_sst_size, 0);
+      ASSERT_LT(live_sst_size, size_before_compaction);
+    }
+
+    uint64_t size_before_compaction = 0;
+    bool callback_triggered = false;
+  };
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options;
+  options.env = CurrentOptions().env;
+  options.disable_auto_compactions = true;
+  options.listeners.push_back(listener);
+  Reopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), std::string(1000, 'v')));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete("key" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t sst_size;
+  bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+  ASSERT_TRUE(ok);
+  ASSERT_GT(sst_size, 0);
+  listener->size_before_compaction = sst_size;
+  // Compact to clean all keys and trigger listener.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+  class TestListener : public EventListener {
+   public:
+    void OnTableFileCreated(const TableFileCreationInfo& info) override {
+      if (info.reason == TableFileCreationReason::kCompaction) {
+        // Verify the property indicates that SSTs created by a running
+        // compaction cannot be deleted.
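+        // In other words, the reported lower bound must not exceed the file
+        // number of an output the compaction is still writing.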
+        uint64_t created_file_num;
+        FileType created_file_type;
+        std::string filename =
+            info.file_path.substr(info.file_path.rfind('/') + 1);
+        ASSERT_TRUE(
+            ParseFileName(filename, &created_file_num, &created_file_type));
+        ASSERT_EQ(kTableFile, created_file_type);
+
+        uint64_t keep_sst_lower_bound;
+        ASSERT_TRUE(
+            db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+                                &keep_sst_lower_bound));
+
+        ASSERT_LE(keep_sst_lower_bound, created_file_num);
+        validated_ = true;
+      }
+    }
+
+    void SetDB(DB* db) { db_ = db; }
+
+    int GetNumCompactions() { return num_compactions_; }
+
+    // True if we've verified the property for at least one output file
+    bool Validated() { return validated_; }
+
+   private:
+    int num_compactions_ = 0;
+    bool validated_ = false;
+    DB* db_ = nullptr;
+  };
+
+  const int kNumL0Files = 4;
+
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options = CurrentOptions();
+  options.listeners.push_back(listener);
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  DestroyAndReopen(options);
+  listener->SetDB(db_);
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure they overlap in keyspace to prevent trivial move
+    ASSERT_OK(Put("key1", "val"));
+    ASSERT_OK(Put("key2", "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlobCacheProperties) {
+  Options options;
+  uint64_t value;
+
+  options.env = CurrentOptions().env;
+
+  // Test with empty blob cache.
+  constexpr size_t kCapacity = 100;
+  LRUCacheOptions co;
+  co.capacity = kCapacity;
+  co.num_shard_bits = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto blob_cache = NewLRUCache(co);
+  options.blob_cache = blob_cache;
+
+  Reopen(options);
+
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(0, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert unpinned blob to the cache and check size.
+  constexpr size_t kSize1 = 70;
+  ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, kSize1,
+                               nullptr /*deleter*/));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(kSize1, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert pinned blob to the cache and check size.
+  constexpr size_t kSize2 = 60;
+  Cache::Handle* blob2 = nullptr;
+  ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, kSize2,
+                               nullptr /*deleter*/, &blob2));
+  ASSERT_NE(nullptr, blob2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  // blob1 is evicted.
+  ASSERT_EQ(kSize2, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2, value);
+
+  // Insert another pinned blob to make the cache over-sized.
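+  // kSize2 + kSize3 = 140 exceeds kCapacity = 100, but pinned entries cannot
+  // be evicted, so usage is allowed to overshoot the capacity.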
+  constexpr size_t kSize3 = 80;
+  Cache::Handle* blob3 = nullptr;
+  ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, kSize3,
+                               nullptr /*deleter*/, &blob3));
+  ASSERT_NE(nullptr, blob3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+
+  // Check size after release.
+  blob_cache->Release(blob2);
+  blob_cache->Release(blob3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  // blob2 will be evicted, while blob3 remains in the cache after release.
+  ASSERT_EQ(kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+  Options options;
+  uint64_t value;
+
+  options.env = CurrentOptions().env;
+
+  // Block cache properties are not available for tables other than
+  // block-based table.
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Block cache properties are not available if block cache is not used.
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Test with empty block cache.
+  constexpr size_t kCapacity = 100;
+  LRUCacheOptions co;
+  co.capacity = kCapacity;
+  co.num_shard_bits = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto block_cache = NewLRUCache(co);
+  table_options.block_cache = block_cache;
+  table_options.no_block_cache = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(0, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert unpinned item to the cache and check size.
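+  // Under kDontChargeCacheMetadata the cache charges exactly the size passed
+  // to Insert(), so usage should grow by precisely kSize1.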
+  constexpr size_t kSize1 = 50;
+  ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1,
+                                nullptr /*deleter*/));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert pinned item to the cache and check size.
+  constexpr size_t kSize2 = 30;
+  Cache::Handle* item2 = nullptr;
+  ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2,
+                                nullptr /*deleter*/, &item2));
+  ASSERT_NE(nullptr, item2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1 + kSize2, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2, value);
+
+  // Insert another pinned item to make the cache over-sized.
+  constexpr size_t kSize3 = 80;
+  Cache::Handle* item3 = nullptr;
+  ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3,
+                                nullptr /*deleter*/, &item3));
+  ASSERT_NE(nullptr, item3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // Item 1 is evicted.
+  ASSERT_EQ(kSize2 + kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+
+  // Check size after release.
+  block_cache->Release(item2);
+  block_cache->Release(item3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // item2 will be evicted, while item3 remains in the cache after release.
+  ASSERT_EQ(kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyDbStats) {
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  CompositeEnvWrapper env(env_, mock_clock);
+
+  Options opts = CurrentOptions();
+  opts.env = &env;
+  Reopen(opts);
+
+  {
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 0.0 /* expected_uptime */,
+                  0 /* expected_user_bytes_written */,
+                  0 /* expected_wal_bytes_written */,
+                  0 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  {
+    mock_clock->SleepForMicroseconds(1500000);
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  0 /* expected_user_bytes_written */,
+                  0 /* expected_wal_bytes_written */,
+                  0 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  int expected_user_bytes_written = 0;
+  {
+    // Write with WAL disabled.
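+    // Only the user-bytes counter should advance here; wal_bytes_written and
+    // the writes-with-wal count stay 0 because nothing is appended to the log.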
+    WriteOptions write_opts;
+    write_opts.disableWAL = true;
+
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("key", "val"));
+    expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+
+    ASSERT_OK(db_->Write(write_opts, &batch));
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  expected_user_bytes_written,
+                  0 /* expected_wal_bytes_written */,
+                  1 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  int expected_wal_bytes_written = 0;
+  {
+    // Write with WAL enabled.
+    WriteBatch batch;
+    ASSERT_OK(batch.Delete("key"));
+    expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+    expected_wal_bytes_written += static_cast<int>(batch.GetDataSize());
+
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  expected_user_bytes_written, expected_wal_bytes_written,
+                  2 /* expected_user_writes_by_self */,
+                  1 /* expected_user_writes_with_wal */);
+  }
+
+  Close();
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) {
+  // Currently only verifies the expected properties are present
+  std::map<std::string, std::string> values;
+  ASSERT_TRUE(
+      db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+  ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheId()) !=
+              values.end());
+  ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheCapacityBytes()) !=
+              values.end());
+  ASSERT_TRUE(
+      values.find(
+          BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()) !=
+      values.end());
+  ASSERT_TRUE(
+      values.find(BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()) !=
+      values.end());
+  for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    CacheEntryRole role = static_cast<CacheEntryRole>(i);
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::EntryCount(role)) !=
+                values.end());
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedBytes(role)) !=
+                values.end());
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedPercent(role)) !=
+                values.end());
+  }
+
+  // There should be no extra values in the map.
+  ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size());
+}
+
+namespace {
+std::string PopMetaIndexKey(InternalIterator* meta_iter) {
+  Status s = meta_iter->status();
+  if (!s.ok()) {
+    return s.ToString();
+  } else if (meta_iter->Valid()) {
+    std::string rv = meta_iter->key().ToString();
+    meta_iter->Next();
+    return rv;
+  } else {
+    return "NOT_FOUND";
+  }
+}
+
+}  // anonymous namespace
+
+TEST_F(DBPropertiesTest, TableMetaIndexKeys) {
+  // This is to detect unexpected churn in metaindex block keys. This is more
+  // of a "table test" but table_test.cc doesn't depend on db_test_util.h and
+  // we need ChangeOptions() for broad coverage.
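+  // Any change to the key strings checked below is effectively a format
+  // compatibility event, since readers locate metaindex entries by exact name.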
+  constexpr int kKeyCount = 100;
+  do {
+    Options options;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    // Create an SST file
+    for (int key = 0; key < kKeyCount; key++) {
+      ASSERT_OK(Put(Key(key), "val"));
+    }
+    ASSERT_OK(Flush());
+
+    // Find its file number
+    std::vector<LiveFileMetaData> files;
+    db_->GetLiveFilesMetaData(&files);
+    // 1 SST file
+    ASSERT_EQ(1, files.size());
+
+    // Open it for inspection
+    std::string sst_file =
+        files[0].directory + "/" + files[0].relative_filename;
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(env_->GetFileSystem()->NewRandomAccessFile(
+        sst_file, FileOptions(), &f, nullptr));
+    std::unique_ptr<RandomAccessFileReader> r;
+    r.reset(new RandomAccessFileReader(std::move(f), sst_file));
+    uint64_t file_size = 0;
+    ASSERT_OK(env_->GetFileSize(sst_file, &file_size));
+
+    // Read metaindex
+    BlockContents bc;
+    ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U,
+                                       ImmutableOptions(options), &bc));
+    Block metaindex_block(std::move(bc));
+    std::unique_ptr<InternalIterator> meta_iter;
+    meta_iter.reset(metaindex_block.NewMetaIterator());
+    meta_iter->SeekToFirst();
+
+    if (strcmp(options.table_factory->Name(),
+               TableFactory::kBlockBasedTableName()) == 0) {
+      auto bbto = options.table_factory->GetOptions<BlockBasedTableOptions>();
+      if (bbto->filter_policy) {
+        if (bbto->partition_filters) {
+          // The key names are intentionally hard-coded here to detect
+          // accidental regression on compatibility.
+          EXPECT_EQ("partitionedfilter.rocksdb.BuiltinBloomFilter",
+                    PopMetaIndexKey(meta_iter.get()));
+        } else {
+          EXPECT_EQ("fullfilter.rocksdb.BuiltinBloomFilter",
+                    PopMetaIndexKey(meta_iter.get()));
+        }
+      }
+      if (bbto->index_type == BlockBasedTableOptions::kHashSearch) {
+        EXPECT_EQ("rocksdb.hashindex.metadata",
+                  PopMetaIndexKey(meta_iter.get()));
+        EXPECT_EQ("rocksdb.hashindex.prefixes",
+                  PopMetaIndexKey(meta_iter.get()));
+      }
+    }
+    EXPECT_EQ("rocksdb.properties", PopMetaIndexKey(meta_iter.get()));
+    EXPECT_EQ("NOT_FOUND", PopMetaIndexKey(meta_iter.get()));
+  } while (ChangeOptions());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..d576f2217
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,2807 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(cbi): parameterize the test to cover user-defined timestamp cases
+class DBRangeDelTest : public DBTestBase {
+ public:
+  DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {}
+
+  std::string GetNumericStr(int key) {
+    uint64_t uint64_key = static_cast<uint64_t>(key);
+    std::string str;
+    str.resize(8);
+    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+    return str;
+  }
+};
+
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+  // TODO: figure out why MmapReads trips the iterator pinning assertion in
+  // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+  // least be explicitly unsupported.
+  for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+    option_config_ = config;
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "dr1", "dr1")
+                    .IsNotSupported());
+  }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+  WriteBatchWithIndex indexedBatch{};
+  ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+                  .IsNotSupported());
+  ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b"));
+  ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+  ASSERT_TRUE(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a")
+          .IsInvalidArgument());
+  ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "dr1", "dr2"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
+  Options opts = CurrentOptions();
+  opts.compression_opts.max_dict_bytes = 16384;
+  Reopen(opts);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+                             "dr2"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+  do {
+    Options opts = CurrentOptions();
+    opts.disable_auto_compactions = true;
+    opts.statistics = CreateDBStatistics();
+    DestroyAndReopen(opts);
+
+    // snapshot protects range tombstone from dropping due to becoming obsolete.
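+    // The COMPACTION_RANGE_DEL_DROP_OBSOLETE ticker checked below counts such
+    // drops; holding the snapshot should keep it at zero through the manual
+    // compaction.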
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                          true /* disallow_trivial_move */));
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+    // compactions as the above assertions about the number of files in a level
+    // do not hold true.
+  } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+  // regression test for exactly filled compaction output files. Previously
+  // another file would be generated containing all range deletions, which
+  // could invalidate the non-overlapping file boundary invariant.
+  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size_deviation = 50;  // each block holds two keys
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+      if (j == 0 && i > 0) {
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+    }
+  }
+  // put extra key to trigger final flush
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+  // Ensures range deletion spanning multiple compaction output files that are
+  // cut by max_compaction_bytes will have non-overlapping key-ranges.
+  // https://github.com/facebook/rocksdb/issues/1778
+  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.level0_file_num_compaction_trigger = kNumFiles;
+  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  // Want max_compaction_bytes to trigger the end of compaction output file,
+  // not target_file_size_base, so make the latter much bigger
+  //  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+  opts.target_file_size_base = 1;
+  DestroyAndReopen(opts);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  Random rnd(301);
+
+  ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(
+      Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(
+      Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+
+  ASSERT_OK(
+      db_->SetOptions(db_->DefaultColumnFamily(),
+                      {{"target_file_size_base",
+                        std::to_string(100 * opts.max_compaction_bytes)}}));
+
+  // It spans the whole key-range, thus will be included in all output files
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             GetNumericStr(0),
+                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 1MB (256 values, each 4K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(kBytesPerVal));
+      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+    }
+    // extra entry to trigger SpecialSkipListFactory's flush
+    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+                                        /*column_family=*/nullptr,
+                                        /*disallow_trivial_move=*/true));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), 2);
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+  for (size_t i = 0; i + 1 < files[1].size(); ++i) {
+    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+                0);
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+  // Regression test for bug where sentinel range deletions (i.e., ones with
+  // sequence number of zero) were included in output files.
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // gaps between ranges create sentinels in our internal representation
+  std::vector<std::pair<std::string, std::string>> range_dels = {
+      {"a", "b"}, {"c", "d"}, {"e", "f"}};
+  for (const auto& range_del : range_dels) {
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               range_del.first, range_del.second));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+  ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Put(WriteOptions(), "b2", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  // first iteration verifies query correctness in memtable, second verifies
+  // query correctness for a single SST file
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+  ASSERT_OK(db_->Put(WriteOptions(), "unused",
+                     "val"));  // prevents empty after compaction
+  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  DestroyAndReopen(opts);
+
+  // Write a third before snapshot, a third between snapshot and tombstone, and
+  // a third after the tombstone. Keys older than snapshot or newer than the
+  // tombstone should be preserved.
+ const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNum; ++i) { + if (i == kNum / 3) { + snapshot = db_->GetSnapshot(); + } else if (i == 2 * kNum / 3) { + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); + } + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + for (int i = 0; i < kNum; ++i) { + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + std::string value; + if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) { + ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value)); + } else { + ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound()); + } + } + db_->ReleaseSnapshot(snapshot); +} + +// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { + const int kNumPerFile = 100, kNumFiles = 4; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.disable_auto_compactions = true; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + opts.num_levels = 2; + opts.statistics = CreateDBStatistics(); + DestroyAndReopen(opts); + + for (int i = 0; i < kNumFiles; ++i) { + if (i > 0) { + // range tombstone covers first half of the previous file + ASSERT_OK(db_->DeleteRange( + WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2))); + } + // Make sure a given key appears in each file so compaction won't be able to + // use trivial move, which would happen if the ranges were non-overlapping. + // Also, we need an extra element since flush is only triggered when the + // number of keys is one greater than SpecialSkipListFactory's limit. + // We choose a key outside the key-range used by the test to avoid conflict. 
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles),
+                       "val"));
+
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ReadOptions read_opts;
+      read_opts.ignore_range_deletions = true;
+      std::string value;
+      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+        ASSERT_OK(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+      } else {
+        ASSERT_TRUE(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+                .IsNotFound());
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_bytes_for_level_base = 2 * kFileBytes;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 3;
+  options.target_file_size_base = kFileBytes;
+  options.target_file_size_multiplier = 1;
+  options.max_compaction_bytes = 1500;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < kNumFiles; ++j) {
+      if (i > 0) {
+        // delete [95,105) in two files, [295,305) in next two
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(mid - 5), Key(mid + 5)));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(rnd.RandomString(990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      if (j < kNumFiles - 1) {
+        // background compaction may happen early for kNumFiles'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+      if (j == options.level0_file_num_compaction_trigger - 1) {
+        // When i == 1, compaction will output some files to L1, at which point
+        // L1 is not bottommost so range deletions cannot be compacted away. The
+        // new L1 files must be generated with non-overlapping key ranges even
+        // though multiple subcompactions see the same ranges deleted, else an
+        // assertion will fail.
+        //
+        // Only enable auto-compactions when we're ready; otherwise, the
+        // oversized L0 (relative to base_level) causes the compaction to run
+        // earlier.
+        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                  {{"disable_auto_compactions", "true"}}));
+        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+        ASSERT_GT(NumTableFilesAtLevel(1), 0);
+        ASSERT_GT(NumTableFilesAtLevel(2), 0);
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+  Options options = CurrentOptions();
+  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFilesPerLevel;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = kNumLevels;
+  options.target_file_size_base = kNumPerFile << 10;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevels - 1; ++i) {
+    for (int j = 0; j < kFilesPerLevel; ++j) {
+      if (i == kNumLevels - 2) {
+        // insert range deletions [95,105) in two files, [295,305) in next two
+        // to prepare L1 for later manual compaction.
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(mid - 5), Key(mid + 5)));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(rnd.RandomString(990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      if (j < kFilesPerLevel - 1) {
+        // background compaction may happen early for kFilesPerLevel'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+  }
+  // Now L1-L3 are full; when we compact L1->L2 we should see (1) subcompactions
+  // happen since input level > 0; (2) range deletions are not dropped since
+  // output level is not bottommost. If no file boundary assertion fails, that
+  // probably means universal compaction + subcompaction + range deletion are
+  // compatible.
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd(),
+      1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+      "" /*trim_ts*/));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+  const int kNumPerFile = 3, kNumFiles = 3;
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile));
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  opts.num_levels = 2;
+  Reopen(opts);
+
+  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+  // requires an extra entry.
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { + if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { + // Delete merge operands from all but the last file + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + } + std::string val; + PutFixed64(&val, i); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + // we need to prevent trivial move using Puts so compaction will actually + // process the merge operands. + ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", "")); + if (i > 0 && i % kNumPerFile == 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + } + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 45); // 1+2+...+9 + ASSERT_EQ(expected, actual); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + expected.clear(); + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + uint64_t tmp; + Slice tmp2(actual); + GetFixed64(&tmp2, &tmp); + PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone) + ASSERT_EQ(expected, actual); +} + +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", + "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + +// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { + // During compaction to bottommost level, verify range tombstones older than + // the oldest snapshot are removed, while others are preserved. + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.num_levels = 2; + opts.statistics = CreateDBStatistics(); + Reopen(opts); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10")); // obsolete after compaction + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20")); // protected by snapshot + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, TableEvictedDuringScan) { + // The RangeDelAggregator holds pointers into range deletion blocks created by + // table readers. 
This test ensures the aggregator can still access those
+  // blocks even if it outlives the table readers that created them.
+  //
+  // DBIter always keeps readers open for L0 files. So, in order to test
+  // aggregator outliving reader, we need to have deletions in L1 files, which
+  // are opened/closed on-demand during the scan. This is accomplished by
+  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+  // from all lingering in L0 (there is at most one range deletion per L0 file).
+  //
+  // The first L1 file will contain a range deletion since its begin key is 0.
+  // SeekToFirst() references that table's reader and adds its range tombstone
+  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+  // the table reader will be unreferenced by the iterator. Since we manually
+  // call Evict() on all readers before the full scan, this unreference causes
+  // the reader's refcount to drop to zero and thus be destroyed.
+  //
+  // When it is destroyed, we do not remove its range deletions from the
+  // aggregator. So, subsequent calls to Next() must be able to use these
+  // deletions to decide whether a key is covered. This will work as long as
+  // the aggregator properly references the range deletion block.
+  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.level0_stop_writes_trigger = 4;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  opts.num_levels = 2;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.block_cache = NewLRUCache(8 << 20);
+  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(opts);
+
+  // Hold a snapshot so range deletions can't become obsolete during compaction
+  // to bottommost level (i.e., L1).
+  const Snapshot* snapshot = db_->GetSnapshot();
+  for (int i = 0; i < kNum; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 GetNumericStr(kRangeBegin),
+                                 GetNumericStr(kRangeEnd)));
+    }
+  }
+  // Must be > 1 so the first L1 file can be closed before scan finishes
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+  int expected = kRangeEnd;
+  iter->SeekToFirst();
+  for (auto file_number : file_numbers) {
+    // This puts table caches in the state of being externally referenced only
+    // so they are destroyed immediately upon iterator unreferencing.
+    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+  }
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+    // Keep clearing block cache's LRU so range deletion block can be freed as
+    // soon as its refcount drops to zero.
+    bbto.block_cache->EraseUnRefEntries();
+  }
+  ASSERT_EQ(kNum, expected);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+
+  // Also test proper cache handling in GetRangeTombstoneIterator,
+  // via TablesRangeTombstoneSummary. (This once triggered memory leak
+  // report with ASAN.)
+ opts.max_open_files = 1; + Reopen(opts); + + std::string str; + ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(), + 100, &str)); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { + do { + DestroyAndReopen(CurrentOptions()); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { + do { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 3; + opts.min_write_buffer_number_to_merge = 2; + // SpecialSkipListFactory lets us specify maximum number of elements the + // memtable can hold. It switches the active memtable to immutable (flush is + // prevented by the above options) upon inserting an element that would + // overflow the memtable. + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + DestroyAndReopen(opts); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Put(WriteOptions(), "blah", "val")); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { + do { + DestroyAndReopen(CurrentOptions()); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + db_->ReleaseSnapshot(snapshot); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) { + const int kNumMergeOps = 10; + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + for (int i = 0; i < kNumMergeOps; ++i) { + std::string val; + PutFixed64(&val, i); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + if (i == kNumMergeOps / 2) { + // deletes [0, 5] + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + } + } + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 30); // 6+7+8+9 + ASSERT_EQ(expected, actual); + + expected.clear(); + read_opts.ignore_range_deletions = true; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 45); // 0+1+2+...+9 + ASSERT_EQ(expected, actual); +} + +TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 4; + opts.min_write_buffer_number_to_merge = 3; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + Reopen(opts); + + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); + ASSERT_OK( + 
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + for (std::string key : {"sst_key", "imm_key", "mem_key"}) { + std::string value; + ASSERT_OK(db_->Get(read_opts, key, &value)); + } + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { + const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); + + // Write half of the keys before the tombstone and half after the tombstone. + // Only covered keys (i.e., within the range and older than the tombstone) + // should be deleted. + for (int i = 0; i < kNum; ++i) { + if (i == kNum / 2) { + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); + } + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); + } + ReadOptions read_opts; + auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); + + int expected = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(GetNumericStr(expected), iter->key()); + if (expected == kRangeBegin - 1) { + expected = kNum / 2; + } else { + ++expected; + } + } + ASSERT_EQ(kNum, expected); + delete iter; +} + +TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { + const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); + + const Snapshot* snapshot = nullptr; + // Put a snapshot before the range tombstone, verify an iterator using that + // snapshot sees all inserted keys. 
+ for (int i = 0; i < kNum; ++i) { + if (i == kNum / 2) { + snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); + } + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); + } + ReadOptions read_opts; + read_opts.snapshot = snapshot; + auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); + + int expected = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(GetNumericStr(expected), iter->key()); + ++expected; + } + ASSERT_EQ(kNum / 2, expected); + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 4; + opts.min_write_buffer_number_to_merge = 3; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + Reopen(opts); + + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); + int i = 0; + std::string expected[] = {"imm_key", "mem_key", "sst_key"}; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { + std::string key; + ASSERT_EQ(expected[i], iter->key()); + } + ASSERT_EQ(3, i); + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +#ifndef ROCKSDB_UBSAN_RUN +TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + // iterations check unsupported in memtable, l0, and then l1 + for (int i = 0; i < 3; ++i) { + ReadOptions read_opts; + read_opts.tailing = true; + auto* iter = db_->NewIterator(read_opts); + if (i == 2) { + // For L1+, iterators over files are created on-demand, so need seek + iter->SeekToFirst(); + } + ASSERT_TRUE(iter->status().IsNotSupported()); + + delete iter; + if (i == 0) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else if (i == 1) { + MoveFilesToLevel(1); + } + } + db_->ReleaseSnapshot(snapshot); +} +#endif // !ROCKSDB_UBSAN_RUN + +TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { + const int kNumFiles = 2, kNumKeysPerFile = 4; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumFiles; + options.max_subcompactions = 2; + options.num_levels = 2; + options.target_file_size_base = 4096; + Reopen(options); + + // need a L1 file for subcompaction to be triggered + ASSERT_OK( + db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + + // put enough keys to fill up the first subcompaction, and later range-delete + // them so that the first 
subcompaction outputs no key-values. In that case
+  // it'll consider making an SST file dedicated to range deletions.
+  for (int i = 0; i < kNumKeysPerFile; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       std::string(1024, 'a')));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeysPerFile)));
+
+  // the above range tombstone can be dropped, so that one alone won't cause a
+  // dedicated file to be opened. We can make one protected by snapshot that
+  // must be considered. Make its range outside the first subcompaction's range
+  // to exercise the tricky part of the code.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(kNumKeysPerFile + 1),
+                             Key(kNumKeysPerFile + 2)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kNumKeys = 1000;
+  const int kPrefixLen = 8;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  Reopen(options);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeys)));
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion causes all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries which removed the need
+  // for that behavior (which could lead to excessively large
+  // compactions).
+  const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+  // max file size could be 2x of target file size, so set it to half of that
+  options.target_file_size_base = kValueBytes / 2;
+  // disable dynamic_file_size, as it will cut L1 files into more files (than
+  // kNumFilesPerLevel).
+ options.level_compaction_dynamic_file_size = false; + options.max_compaction_bytes = 1500; + // i == 0: CompactFiles + // i == 1: CompactRange + // i == 2: automatic compaction + for (int i = 0; i < 3; ++i) { + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // snapshot protects range tombstone from dropping due to becoming obsolete. + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel))); + + Random rnd(301); + std::string value = rnd.RandomString(kValueBytes); + for (int j = 0; j < kNumFilesPerLevel; ++j) { + // give files overlapping key-ranges to prevent trivial move + ASSERT_OK(Put(Key(j), value)); + ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); + if (j > 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(j, NumTableFilesAtLevel(0)); + } + } + // put extra key to trigger final flush + ASSERT_OK(Put("", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + if (i == 0) { + ASSERT_OK(db_->CompactFiles( + CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + } else if (i == 1) { + auto begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end)); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + } else if (i == 2) { + ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), + {{"max_bytes_for_level_base", "10000"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + } + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + db_->ReleaseSnapshot(snapshot); + } +} + +TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { + // Test the handling of the range-tombstone end-key as the + // upper-bound for an sstable. + + const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumFilesPerLevel; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); + options.target_file_size_base = kValueBytes; + options.disable_auto_compactions = true; + // disable it for now, otherwise the L1 files are going be cut before data 1: + // L1: [0] [1,4] + // L2: [0,0] + // because the grandparent file is between [0]->[1] and it's size is more than + // 1/8 of target size (4k). + options.level_compaction_dynamic_file_size = false; + + DestroyAndReopen(options); + + // Create an initial sstable at L2: + // [key000000#1,1, key000000#1,1] + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // A snapshot protects the range tombstone from dropping due to + // becoming obsolete. + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel))); + + // Create 2 additional sstables in L0. Note that the first sstable + // contains the range tombstone. 
+  // [key000000#3,1, key000004#72057594037927935,15]
+  // [key000001#5,1, key000002#6,1]
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueBytes);
+  for (int j = 0; j < kNumFilesPerLevel; ++j) {
+    // Give files overlapping key-ranges to prevent a trivial move when we
+    // compact from L0 to L1.
+    ASSERT_OK(Put(Key(j), value));
+    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+  }
+  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+  // are 2 sstables generated in L1 due to the target_file_size_base setting.
+  // L1:
+  //   [key000000#3,1, key000002#72057594037927935,15]
+  //   [key000002#6,1, key000004#72057594037927935,15]
+  // L2:
+  //   [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    // L1:
+    //   [key000000#3,1, key000002#72057594037927935,15]
+    // L2:
+    //   [key000000#1,1, key000000#1,1]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    //
+    // At the same time, verify the compaction does not cause the key at the
+    // endpoint (key000002#6,1) to disappear.
+    ASSERT_EQ(value, Get(Key(2)));
+    auto begin_str = Key(3);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+    ASSERT_EQ(value, Get(Key(2)));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    // L2:
+    //   [key000000#1,1, key000001#72057594037927935,15]
+    //   [key000001#5,1, key000002#72057594037927935,15]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+  // Regression test for #2752. Range delete tombstones between
+  // different snapshot stripes are not stored in order, so the first
+  // tombstone of each snapshot stripe should be checked as a smallest
+  // candidate.
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  auto cf = db_->DefaultColumnFamily();
+
+  ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+  // Hold a snapshot to separate these two delete ranges.
+  auto snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  db_->ReleaseSnapshot(snapshot);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(cf, &files);
+  ASSERT_EQ(1, files[0].size());
+  ASSERT_EQ("a", files[0][0].smallest.user_key());
+  ASSERT_EQ("c", files[0][0].largest.user_key());
+
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "a", &v);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+  // Mock non-associative operator.
Non-associativity is expressed by lack of
+  // implementation for any `PartialMerge*` functions.
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    assert(merge_out != nullptr);
+    merge_out->new_value = merge_in.operand_list.back().ToString();
+    return true;
+  }
+
+  const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+  // This test uses a non-associative merge operator since that is a convenient
+  // way to get compaction to write out files with overlapping user-keys at the
+  // endpoints. Note, however, overlapping endpoints can also occur with other
+  // value types (Put, etc.), assuming the right snapshots are present.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // Push dummy data to L3 so that our actual test files on L0-L2 will not be
+  // considered "bottommost" level; otherwise compaction may prevent us from
+  // creating overlapping user keys, since on the bottommost level MergeHelper
+  // combines all merge operands.
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(3);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+    }
+    if (i == kNumFiles - 1) {
+      // Take snapshot to prevent covered merge operands from being dropped by
+      // compaction.
+      snapshot = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Now we have multiple files at L1 all containing a single user key, thus
+  // guaranteeing overlap in the file endpoints.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Verify no merge operands reappeared after the compaction.
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  // Compact and verify again. It's worthwhile because now the files have
+  // tighter endpoints, so we can verify that doesn't mess anything up.
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      1 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_GT(NumTableFilesAtLevel(2), 1);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+  // Verify a key newer than a range tombstone cannot be deleted by being
+  // compacted to the bottom level (and thus having its seqnum zeroed) before
+  // the range tombstone.
This used to happen when range tombstones were + // untruncated on reads such that they extended past their file boundaries. + // + // Test summary: + // + // - L1 is bottommost. + // - A couple snapshots are strategically taken to prevent seqnums from being + // zeroed, range tombstone from being dropped, merge operands from being + // dropped, and merge operands from being combined. + // - Left half of files in L1 all have same user key, ensuring their file + // boundaries overlap. In the past this would cause range tombstones to be + // untruncated. + // - Right half of L1 files all have different keys, ensuring no overlap. + // - A range tombstone spans all L1 keys, so it is stored in every L1 file. + // - Keys in the right side of the key-range are overwritten. These are + // compacted down to L1 after releasing snapshots such that their seqnums + // will be zeroed. + // - A full range scan is performed. If the tombstone in the left L1 files + // were untruncated, it would now cover keys newer than it (but with zeroed + // seqnums) in the right L1 files. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + const int kNumFiles = 4; + const int kMaxKey = kNumFiles * kFileBytes / kValueBytes; + const int kKeysOverwritten = 10; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.num_levels = 2; + options.target_file_size_base = kFileBytes; + Reopen(options); + + Random rnd(301); + // - snapshots[0] prevents merge operands from being combined during + // compaction. + // - snapshots[1] prevents merge operands from being dropped due to the + // covering range tombstone. + const Snapshot* snapshots[] = {nullptr, nullptr}; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = rnd.RandomString(kValueBytes); + std::string key; + if (i < kNumFiles / 2) { + key = Key(0); + } else { + key = Key(1 + i * kFileBytes / kValueBytes + j); + } + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + } + if (i == 0) { + snapshots[0] = db_->GetSnapshot(); + } + if (i == kNumFiles - 1) { + snapshots[1] = db_->GetSnapshot(); + // The DeleteRange is the last write so all merge operands are covered. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(kMaxKey + 1))); + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); + + auto get_key_count = [this]() -> int { + auto* iter = db_->NewIterator(ReadOptions()); + assert(iter->status().ok()); + iter->SeekToFirst(); + int keys_found = 0; + for (; iter->Valid(); iter->Next()) { + ++keys_found; + } + delete iter; + return keys_found; + }; + + // All keys should be covered + ASSERT_EQ(0, get_key_count()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */, + nullptr /* end_key */)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + // Roughly the left half of L1 files should have overlapping boundary keys, + // while the right half should not. + ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles); + + // Now overwrite a few keys that are in L1 files that definitely don't have + // overlapping boundary keys. 
+ for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) { + auto value = rnd.RandomString(kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value)); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + // The overwritten keys are in L0 now, so clearly aren't covered by the range + // tombstone in L1. + ASSERT_EQ(kKeysOverwritten, get_key_count()); + + // Release snapshots so seqnums can be zeroed when L0->L1 happens. + db_->ReleaseSnapshot(snapshots[0]); + db_->ReleaseSnapshot(snapshots[1]); + + auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1); + auto end_key_storage = Key(kMaxKey); + Slice begin_key(begin_key_storage); + Slice end_key(end_key_storage); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles); + + ASSERT_EQ(kKeysOverwritten, get_key_count()); +} + +TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { + // Exposes a bug where we were using + // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands + // in the forward direction. Confusingly, this case happened during + // `DBIter::Prev`. It could cause assertion failure, or reappearing keys. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + // Need multiple keys so we can get results when calling `Prev()` after + // `SeekToLast()`. + const int kNumKeys = 3; + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.target_file_size_base = kFileBytes; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = rnd.RandomString(kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value)); + if (i == 0 && j == kNumKeys) { + // Take snapshot to prevent covered merge operands from being dropped or + // merged by compaction. + snapshot = db_->GetSnapshot(); + // Do a DeleteRange near the beginning so only the oldest merge operand + // for each key is covered. This ensures the sequence of events: + // + // - `DBIter::Prev()` is called + // - After several same versions of the same user key are encountered, + // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`. + // - Binary searches to the newest version of the key, which is in the + // leftmost file containing the user key. + // - Scans forwards to collect all merge operands. Eventually reaches + // the rightmost file containing the oldest merge operand, which + // should be covered by the `DeleteRange`. If `RangeDelAggregator` + // were not properly using `kForwardTraversal` here, that operand + // would reappear. 
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(0), Key(kNumKeys + 1)));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  auto* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  iter->SeekToLast();
+  int keys_found = 0;
+  for (; iter->Valid(); iter->Prev()) {
+    ++keys_found;
+  }
+  delete iter;
+  ASSERT_EQ(kNumKeys, keys_found);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // block flush thread -> pin immtables in memory
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+       "DBImpl::BGWorkFlush"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(0), "a"));
+  std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+      snapshot(db_->GetSnapshot(),
+               [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot.get();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+  ASSERT_OK(iter->status());
+
+  TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for issue where range tombstone was written to more files
+  // than necessary when it began exactly at the begin key in the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Have a bit of slack in the size limits but we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+  // files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+  ASSERT_OK(Put("c" + Key(1), "value"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+  // and the range tombstone is only placed in the second SST.
+  std::string begin_key_storage("c" + Key(1));
+  Slice begin_key(begin_key_storage);
+  std::string end_key_storage("d");
+  Slice end_key(end_key_storage);
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, &begin_key /* begin */, &end_key /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> all_metadata;
+  std::vector<LiveFileMetaData> l1_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (const auto& metadata : all_metadata) {
+    if (metadata.level == 1) {
+      l1_metadata.push_back(metadata);
+    }
+  }
+  std::sort(l1_metadata.begin(), l1_metadata.end(),
+            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+                     0;
+            });
+  ASSERT_EQ("a", l1_metadata[0].smallestkey);
+  ASSERT_EQ("a", l1_metadata[0].largestkey);
+  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+  ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+  TablePropertiesCollection all_table_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+  int64_t num_range_deletions = 0;
+  for (const auto& name_and_table_props : all_table_props) {
+    const auto& name = name_and_table_props.first;
+    const auto& table_props = name_and_table_props.second;
+    // The range tombstone should only be output to the second L1 SST.
+    if (name.size() >= l1_metadata[1].name.size() &&
+        name.substr(name.size() - l1_metadata[1].name.size())
+                .compare(l1_metadata[1].name) == 0) {
+      ASSERT_EQ(1, table_props->num_range_deletions);
+      ++num_range_deletions;
+    } else {
+      ASSERT_EQ(0, table_props->num_range_deletions);
+    }
+  }
+  ASSERT_EQ(1, num_range_deletions);
+}
+
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 9 * 1024;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key((kNumFiles)*kNumPerFile + 1)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+
+  // The tombstone range is not broken up into multiple SSTs which may incur a
+  // large compaction with L2.
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  std::vector<std::vector<FileMetaData>> files;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 9 * 1024;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+    ASSERT_OK(Put(Key(i), "0x123"));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // The key range is broken up into three SSTs to avoid a future big
+  // compaction with the grandparent.
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  // L1->L2 compaction size is limited to max_compaction_bytes.
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, IteratorRefresh) {
+  // Refreshing an iterator after a range tombstone is added should cause the
+  // deleted range of keys to disappear.
+  for (bool sv_changed : {false, true}) {
+    ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+    ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+    auto* iter = db_->NewIterator(ReadOptions());
+    ASSERT_OK(iter->status());
+
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "key2", "key3"));
+
+    if (sv_changed) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(iter->Refresh());
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    ASSERT_EQ("key1", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+
+    delete iter;
+  }
+}
+
+void VerifyIteratorReachesEnd(InternalIterator* iter) {
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+void VerifyIteratorReachesEnd(Iterator* iter) {
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+TEST_F(DBRangeDelTest, IteratorReseek) {
+  // Range tombstone triggers reseek (seeking to a range tombstone end key) in
+  // merging iterator. Test set up:
+  //   one memtable: range tombstone [0, 1)
+  //   one immutable memtable: range tombstone [1, 2)
+  //   one L0 file with range tombstone [2, 3)
+  //   one L1 file with range tombstone [3, 4)
+  // Seek(0) should trigger cascading reseeks at all levels below memtable.
+  // Seek(1) should trigger cascading reseeks at all levels below immutable
+  // memtable. SeekToFirst and SeekToLast trigger no reseek.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(4)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  // L0
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  // Immutable memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(2)));
+  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  std::string value;
+  ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+                                    "rocksdb.num-immutable-mem-table", &value));
+  ASSERT_EQ(1, std::stoi(value));
+  // Live memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+  // This memtable is still active.
+  ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+                                    "rocksdb.num-immutable-mem-table", &value));
+  ASSERT_EQ(1, std::stoi(value));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  // Reseeked immutable memtable, L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 3);
+  VerifyIteratorReachesEnd(iter);
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(1));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  VerifyIteratorReachesEnd(iter);
+  get_perf_context()->Reset();
+  iter->SeekToFirst();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  VerifyIteratorReachesEnd(iter);
+  iter->SeekToLast();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  VerifyIteratorReachesEnd(iter);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, ReseekDuringNextAndPrev) {
+  // Range tombstone triggers reseek during Next()/Prev() in merging iterator.
+  // Test set up:
+  //   memtable has: [0, 1) [2, 3)
+  //   L0 has: 2
+  //   L1 has: 1, 2, 3
+  // Seek(0) will reseek to 1 for L0 and L1. Seek(1) will not trigger any
+  // reseek. Then Next() determines 2 is covered by [2, 3) and tries to
+  // reseek to 3 for L0 and L1. A similar story holds for Prev() and
+  // SeekForPrev().
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  auto iter_test_forward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(1));
+
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(3));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    // Next to Prev
+    get_perf_context()->Reset();
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(1));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    // Prev to Next
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(3));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    iter->Next();
+    VerifyIteratorReachesEnd(iter);
+  };
+
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  iter_test_forward();
+  get_perf_context()->Reset();
+  iter->Seek(Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(2));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  iter_test_forward();
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  get_perf_context()->Reset();
+  iter->SeekToFirst();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  iter->SeekToLast();
+  iter->Prev();
+  iter_test_forward();
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneFromCurrentLevel) {
+  // Range tombstone triggers reseek when covering a key from the same level
+  // in the merging iterator. Test set up:
+  //   memtable has: [0, 1)
+  //   L0 has: [2, 3), 2
+  //   L1 has: 1, 2, 3
+  // Seek(0) will reseek to 1 for L0 and L1.
+  // Then Next() will reseek to 3 for L1 since 2 in L0 is covered by [2, 3) in
+  // L0.
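+  // Note: as in the other tests here, the assertions read
+  // get_perf_context()->internal_range_del_reseek_count, the perf-context
+  // counter that tracks how many child iterators were reseeked to a range
+  // tombstone's end key.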
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+  get_perf_context()->Reset();
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(3));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+  delete iter;
+}
+
+class TombstoneTestSstPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) override {
+    if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) {
+      return kRequired;
+    } else {
+      return kNotRequired;
+    }
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+
+  const Comparator* cmp = BytewiseComparator();
+};
+
+class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() {
+    return "TombstoneTestSstPartitionerFactory";
+  }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new TombstoneTestSstPartitioner());
+  }
+};
+
+TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) {
+  // Verify that a range tombstone across file boundary covers keys from older
+  // levels. Test set up:
+  //   L1_0: 1, 3, [2, 6)    L1_1: 5, 7, [2, 6) ([2, 6) is from compaction
+  //   with L1_0)
+  //   L2 has: 5
+  // Seek(1) and then Next() should move the L1 level iterator to
+  // L1_1. Check if 5 is returned after Next().
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 2 * 1024;
+  options.max_compaction_bytes = 2 * 1024;
+
+  // Make sure L1 files are split before "5"
+  auto factory = std::make_shared<TombstoneTestSstPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L2
+  // the file should be smaller than max_compaction_bytes, otherwise the file
+  // will be cut before 7.
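+  // (Hence the smaller 1 << 9 value below, vs the 1 << 10 values used for
+  // the L1 keys.)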
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(7), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(1 << 10)));
+  // Prevent keys being compacted away
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(6)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(1));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(1));
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(7));
+  // 1 reseek into L2 when key 5 in L2 is covered by [2, 6) from L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, NonOverlappingTombstoneAtBoundary) {
+  // Verify that a range tombstone at a file boundary covers keys from older
+  // levels even when it does not overlap any point key in its own file.
+  // Test set up:
+  //   L1_0: 1, 3, [4, 7)    L1_1: 6, 8, [4, 7)
+  //   L2: 5
+  // Note that [4, 7) is at the end of L1_0 and not overlapping with any point
+  // key in L1_0. [4, 7) from L1_0 should cover 5 if the sentinel works.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 2 * 1024;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(8), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+  // Prevent keys being compacted away
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(7)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(3));
+  get_perf_context()->Reset();
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(8));
+  // 1 reseek into L1 since 5 from L2 is covered by [4, 7) from L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+  for (auto& k : {4, 5, 6}) {
+    get_perf_context()->Reset();
+    iter->Seek(Key(k));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(8));
+    // 1 reseek into L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+  }
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, OlderLevelHasNewerData) {
+  // L1_0: 1, 3, [2, 7)    L1_1: 5, 6 at a newer sequence number than [2, 7)
+  // Compact L1_1 to L2. Seek(3) should not skip 5 or 6.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(7)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto key = Key(6);
+  Slice begin(key);
+  EXPECT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(5));
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(6));
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, LevelBoundaryDefinedByTombstone) {
+  // L1 has: 1, 2, [4, 5)
+  // L2 has: 4
+  // Seek(3), which is beyond all point keys in L1; check whether the
+  // sentinel key from L1 works in this case.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_OK(iter->status());
+
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(5));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  db_->ReleaseSnapshot(snapshot);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyFile) {
+  // L1_0: 1, 2    L1_1: [3, 5)
+  // L2: 3
+  // Seek(2) then Next() should advance the L1 iterator into L1_1.
+  // If the sentinel works with a tombstone-only file, it should cover the
+  // key in L2. Similar story for SeekForPrev(4).
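+  // A tombstone-only file contains no point keys, so its smallest/largest
+  // keys come entirely from the range tombstone's start and end keys.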
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(2));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  iter->Next();
+  VerifyIteratorReachesEnd(iter);
+  iter->SeekForPrev(Key(4));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  iter->Next();
+  VerifyIteratorReachesEnd(iter);
+  delete iter;
+}
+
+void VerifyIteratorKey(InternalIterator* iter,
+                       const std::vector<std::string>& expected_keys,
+                       bool forward = true) {
+  for (auto& key : expected_keys) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->user_key(), key);
+    if (forward) {
+      iter->Next();
+    } else {
+      iter->Prev();
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
+  // L1 has: [3, 5)
+  // L2 has: 3, 4
+  // Any kind of iterator seek should skip 3 and 4 in L2.
+  // The L1 level iterator should produce a sentinel key.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  uint64_t expected_reseek = 0;
+  for (auto i = 0; i < 7; ++i) {
+    iter->Seek(Key(i));
+    VerifyIteratorReachesEnd(iter);
+    if (i < 5) {
+      ++expected_reseek;
+    }
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              expected_reseek);
+    iter->SeekForPrev(Key(i));
+    VerifyIteratorReachesEnd(iter);
+    if (i > 2) {
+      ++expected_reseek;
+    }
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              expected_reseek);
+    iter->SeekToFirst();
+    VerifyIteratorReachesEnd(iter);
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              ++expected_reseek);
+    iter->SeekToLast();
+    VerifyIteratorReachesEnd(iter);
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              ++expected_reseek);
+  }
+  delete iter;
+
+  // Check L1 LevelIterator behavior
+  ColumnFamilyData* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  SuperVersion* sv = cfd->GetSuperVersion();
+  Arena arena;
+  ReadOptions read_options;
+  MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+                                          false /* prefix seek */);
+  InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+      read_options, &merge_iter_builder, 1 /* level */, true);
+  // This is needed to make LevelIterator range tombstone aware
+  auto miter = merge_iter_builder.Finish();
+  auto k = Key(3);
+  IterKey target;
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  // Sentinel key (file boundary as a fake key)
+  VerifyIteratorKey(level_iter, {Key(5)});
+  VerifyIteratorReachesEnd(level_iter);
+
+  k = Key(5);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(3)}, false);
+  VerifyIteratorReachesEnd(level_iter);
+
+  level_iter->SeekToFirst();
+  VerifyIteratorKey(level_iter, {Key(5)});
+  VerifyIteratorReachesEnd(level_iter);
+
+  level_iter->SeekToLast();
+  VerifyIteratorKey(level_iter, {Key(3)}, false);
+  VerifyIteratorReachesEnd(level_iter);
+
+  miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyWithOlderVisibleKey) {
+  // L1: [3, 5)
+  // L2: 2, 4, 5
+  // 2 and 5 should be visible
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  auto iter_test_backward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(5));
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(2));
+    iter->Prev();
+    VerifyIteratorReachesEnd(iter);
+  };
+  auto iter_test_forward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(2));
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(5));
+    iter->Next();
+    VerifyIteratorReachesEnd(iter);
+  };
+  iter->Seek(Key(4));
+  iter_test_backward();
+  iter->SeekForPrev(Key(4));
+  iter->Next();
+  iter_test_backward();
+
+  iter->Seek(Key(4));
+  iter->Prev();
+  iter_test_forward();
+  iter->SeekForPrev(Key(4));
+  iter_test_forward();
+
+  iter->SeekToFirst();
+  iter_test_forward();
+  iter->SeekToLast();
+  iter_test_backward();
+
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneSentinelDirectionChange) {
+  // L1: 7
+  // L2: [4, 6)
+  // L3: 4
+  // Seek(5) will place the sentinel key 6 at the top of the min-heap in the
+  // merging iterator; then do a Prev() and check how the sentinel behaves.
+  // Redo the test after Put(5) into L1 so that there is a visible key in
+  // range [4, 6).
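+  // (Background: the "sentinel" is the SST boundary key that the level
+  // iterator exposes when a range tombstone extends to the edge of a file;
+  // it keeps the tombstone in the merging iterator's heap across direction
+  // changes.)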
+ Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.target_file_size_base = 3 * 1024; + + DestroyAndReopen(options); + // L3 + ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(3); + ASSERT_EQ(1, NumTableFilesAtLevel(3)); + // L2 + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(6))); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // L1 + ASSERT_OK(db_->Put(WriteOptions(), Key(7), "foobar")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Key(5)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(7)); + iter->Prev(); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + delete iter; + + ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + iter = db_->NewIterator(ReadOptions()); + iter->Seek(Key(5)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(5)); + iter->Prev(); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + delete iter; +} + +// Right sentinel tested in many test cases above +TEST_F(DBRangeDelTest, LeftSentinelKeyTest) { + // L1_0: 0, 1 L1_1: [2, 3), 5 + // L2: 2 + // SeekForPrev(4) should give 1 due to sentinel key keeping [2, 3) alive. + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.target_file_size_base = 3 * 1024; + options.max_compaction_bytes = 1024; + + DestroyAndReopen(options); + // L2 + ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // L1_0 + Random rnd(301); + ASSERT_OK(db_->Put(WriteOptions(), Key(0), rnd.RandomString(4 << 10))); + ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10))); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + // L1_1 + ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2), + Key(3))); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + auto iter = db_->NewIterator(ReadOptions()); + iter->SeekForPrev(Key(4)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(1)); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(0)); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + +TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) { + // L1_0: 1, 2 newer than L1_1, L1_1: [2, 4), 5 + // L2: 3 + // SeekForPrev(4) then Prev() should give 2 and then 1. 
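+  // The test additionally verifies, through an internal iterator, that the
+  // value returned for 2 is the newer one from L1_0 (checked by sequence
+  // number below).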
+ Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.target_file_size_base = 3 * 1024; + options.max_compaction_bytes = 1024; + + DestroyAndReopen(options); + // L2 + ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // L1_1 + ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2), + Key(4))); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + // L1_0 + Random rnd(301); + ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10))); + ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10))); + // Used to verify sequence number of iterator key later. + auto seq = dbfull()->TEST_GetLastVisibleSequence(); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + Arena arena; + InternalKeyComparator icmp(options.comparator); + ReadOptions read_options; + ScopedArenaIterator iter; + iter.set( + dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber)); + + auto k = Key(4); + IterKey target; + target.SetInternalKey(k, 0 /* sequence_number */, kValueTypeForSeekForPrev); + iter->SeekForPrev(target.GetInternalKey()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->user_key(), Key(2)); + SequenceNumber actual_seq; + ValueType type; + UnPackSequenceAndType(ExtractInternalKeyFooter(iter->key()), &actual_seq, + &type); + ASSERT_EQ(seq, actual_seq); + // might as well check type + ASSERT_EQ(type, kTypeValue); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->user_key(), Key(1)); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); +} + +TEST_F(DBRangeDelTest, SentinelKeyCommonCaseTest) { + // L1 has 3 files + // L1_0: 1, 2 L1_1: [3, 4) 5, 6, [7, 8) L1_2: 9 + // Check iterator operations on LevelIterator. 
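+  // In the expected key sequences below, a repeated key (e.g. Key(9), Key(9))
+  // is the file's point key followed by its boundary sentinel.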
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(4)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
+                             Key(8)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  // L1_2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(9), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  ColumnFamilyData* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  SuperVersion* sv = cfd->GetSuperVersion();
+  Arena arena;
+  ReadOptions read_options;
+  MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+                                          false /* prefix seek */);
+  InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+      read_options, &merge_iter_builder, 1 /* level */, true);
+  // This is needed to make LevelIterator range tombstone aware
+  auto miter = merge_iter_builder.Finish();
+  auto k = Key(7);
+  IterKey target;
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  // The last Key(9) is a sentinel key.
+  VerifyIteratorKey(level_iter, {Key(8), Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(6);
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(6), Key(8), Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(4);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(3), Key(2), Key(1), Key(1)}, false);
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(5);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(5), Key(3), Key(2), Key(1), Key(1)},
+                    false);
+
+  level_iter->SeekToFirst();
+  VerifyIteratorKey(level_iter, {Key(1), Key(2), Key(2), Key(5), Key(6), Key(8),
+                                 Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  level_iter->SeekToLast();
+  VerifyIteratorKey(
+      level_iter,
+      {Key(9), Key(9), Key(6), Key(5), Key(3), Key(2), Key(1), Key(1)}, false);
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, PrefixSentinelKey) {
+  // L1: ['aaaa', 'aaad'), 'bbbb'
+  // L2: 'aaac', 'aaae'
+  // The prefix extractor takes the first 3 chars.
+  // Seek('aaab') should give 'aaae' as the first key.
+  // This is to test a previous bug where prefix seek saw there was no
+  // matching prefix in the SST file and would just set the file iter to null
+  // in LevelIterator, possibly skipping to the next SST file. In this case,
+  // we should keep the file's tombstone alive.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  table_options.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // L2:
+  ASSERT_OK(db_->Put(WriteOptions(), "aaac", rnd.RandomString(10)));
+  ASSERT_OK(db_->Put(WriteOptions(), "aaae", rnd.RandomString(10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "aaaa",
+                             "aaad"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bbbb", rnd.RandomString(10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek("aaab");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "aaae");
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, RefreshMemtableIter) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ReadOptions ro;
+  ro.read_tier = kMemtableTier;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
+  ASSERT_OK(Flush());
+  // The first refresh reinitializes the iterator, which had a bug where
+  // iter.memtable_range_tombstone_iter_ was not set to nullptr, causing a
+  // subsequent refresh to double free.
+  ASSERT_OK(iter->Refresh());
+  ASSERT_OK(iter->Refresh());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+  // Memtable: a, [b, bz)
+  // Do a Seek on `a` with iterate_upper_bound being az.
+  // The range tombstone [b, bz) should not be processed (added to and
+  // popped from the min_heap in MergingIterator).
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "bar"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+  // I could not find a cleaner way to test this without relying on
+  // implementation detail. Tried to test the value of
+  // `internal_range_del_reseek_count` but that did not work
+  // since BlockBasedTable iterator becomes !Valid() when point key
+  // is out of bound and that reseek only happens when a point key
+  // is covered by some range tombstone.
+  SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+                                        [](void*) {
+                                          // there should not be any range
+                                          // tombstone in the heap.
+                                          FAIL();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions read_opts;
+  std::string upper_bound = "az";
+  Slice upper_bound_slice = upper_bound;
+  read_opts.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "a");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_rate_limiter_test.cc b/src/rocksdb/db/db_rate_limiter_test.cc
new file mode 100644
index 000000000..e44cc047d
--- /dev/null
+++ b/src/rocksdb/db/db_rate_limiter_test.cc
@@ -0,0 +1,451 @@
+// Copyright (c) 2022-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdint>
+
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRateLimiterOnReadTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+  explicit DBRateLimiterOnReadTest()
+      : DBTestBase("db_rate_limiter_on_read_test", /*env_do_fsync=*/false),
+        use_direct_io_(std::get<0>(GetParam())),
+        use_block_cache_(std::get<1>(GetParam())),
+        use_readahead_(std::get<2>(GetParam())) {}
+
+  void Init() {
+    options_ = GetOptions();
+    Reopen(options_);
+    for (int i = 0; i < kNumFiles; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), "val"));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+  }
+
+  BlockBasedTableOptions GetTableOptions() {
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = !use_block_cache_;
+    return table_options;
+  }
+
+  ReadOptions GetReadOptions() {
+    ReadOptions read_options;
+    read_options.rate_limiter_priority = Env::IO_USER;
+    read_options.readahead_size = use_readahead_ ? kReadaheadBytes : 0;
+    return read_options;
+  }
+
+  Options GetOptions() {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    options.file_checksum_gen_factory.reset(new FileChecksumGenCrc32cFactory());
+    options.rate_limiter.reset(NewGenericRateLimiter(
+        1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+        10 /* fairness */, RateLimiter::Mode::kAllIo));
+    options.table_factory.reset(NewBlockBasedTableFactory(GetTableOptions()));
+    options.use_direct_reads = use_direct_io_;
+    return options;
+  }
+
+ protected:
+  const static int kNumKeysPerFile = 1;
+  const static int kNumFiles = 3;
+  const static int kReadaheadBytes = 32 << 10;  // 32KB
+
+  Options options_;
+  const bool use_direct_io_;
+  const bool use_block_cache_;
+  const bool use_readahead_;
+};
+
+std::string GetTestNameSuffix(
+    ::testing::TestParamInfo<std::tuple<bool, bool, bool>> info) {
+  std::ostringstream oss;
+  if (std::get<0>(info.param)) {
+    oss << "DirectIO";
+  } else {
+    oss << "BufferedIO";
+  }
+  if (std::get<1>(info.param)) {
+    oss << "_BlockCache";
+  } else {
+    oss << "_NoBlockCache";
+  }
+  if (std::get<2>(info.param)) {
+    oss << "_Readahead";
+  } else {
+    oss << "_NoReadahead";
+  }
+  return oss.str();
+}
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        GetTestNameSuffix);
+#else   // ROCKSDB_LITE
+// Cannot use direct I/O in lite mode.
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+                        ::testing::Combine(::testing::Values(false),
+                                           ::testing::Bool(),
+                                           ::testing::Bool()),
+                        GetTestNameSuffix);
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBRateLimiterOnReadTest, Get) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  int expected = 0;
+  for (int i = 0; i < kNumFiles; ++i) {
+    {
+      std::string value;
+      ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+      ++expected;
+    }
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+    {
+      std::string value;
+      ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+      if (!use_block_cache_) {
+        ++expected;
+      }
+    }
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+}
+
+TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  const int kNumKeys = kNumFiles * kNumKeysPerFile;
+  int64_t expected = 0;
+  {
+    std::vector<std::string> key_bufs;
+    key_bufs.reserve(kNumKeys);
+    std::vector<Slice> keys;
+    keys.reserve(kNumKeys);
+    for (int i = 0; i < kNumKeys; ++i) {
+      key_bufs.emplace_back(Key(i));
+      keys.emplace_back(key_bufs[i]);
+    }
+    std::vector<Status> statuses(kNumKeys);
+    std::vector<PinnableSlice> values(kNumKeys);
+    const int64_t prev_total_rl_req = options_.rate_limiter->GetTotalRequests();
+    db_->MultiGet(GetReadOptions(), dbfull()->DefaultColumnFamily(), kNumKeys,
+                  keys.data(), values.data(), statuses.data());
+    const int64_t cur_total_rl_req = options_.rate_limiter->GetTotalRequests();
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_TRUE(statuses[i].ok());
+    }
+    ASSERT_GT(cur_total_rl_req, prev_total_rl_req);
+    ASSERT_EQ(cur_total_rl_req - prev_total_rl_req,
+              options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+  expected += kNumKeys;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
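+
+// For context on the accounting pattern used throughout these tests: reads
+// are charged to the rate limiter because GetReadOptions() sets
+// ReadOptions::rate_limiter_priority. A minimal sketch of that setup (public
+// API only; the numeric values are illustrative):
+//
+//   Options options;
+//   options.rate_limiter.reset(NewGenericRateLimiter(
+//       1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+//       10 /* fairness */, RateLimiter::Mode::kAllIo));
+//   ReadOptions read_options;
+//   read_options.rate_limiter_priority = Env::IO_USER;
+//   // File reads issued for this request are now counted under Env::IO_USER
+//   // and observable via rate_limiter->GetTotalRequests(Env::IO_USER).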
+
+TEST_P(DBRateLimiterOnReadTest, OldMultiGet) {
+  // The old `vector`-returning `MultiGet()` APIs use `Read()`, which
+  // supports rate limiting.
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  const int kNumKeys = kNumFiles * kNumKeysPerFile;
+  int expected = 0;
+  {
+    std::vector<std::string> key_bufs;
+    key_bufs.reserve(kNumKeys);
+    std::vector<Slice> keys;
+    keys.reserve(kNumKeys);
+    for (int i = 0; i < kNumKeys; ++i) {
+      key_bufs.emplace_back(Key(i));
+      keys.emplace_back(key_bufs[i]);
+    }
+    std::vector<std::string> values;
+    std::vector<Status> statuses =
+        db_->MultiGet(GetReadOptions(), keys, &values);
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(statuses[i]);
+    }
+  }
+  expected += kNumKeys;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, Iterator) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(GetReadOptions()));
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++expected;
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    // When `use_block_cache_ == true`, the reverse scan will access the blocks
+    // loaded to cache during the above forward scan, in which case no further
+    // file reads are expected.
+    if (!use_block_cache_) {
+      ++expected;
+    }
+  }
+  // Reverse scan does not read evenly (one block per iteration) due to
+  // descending seqno ordering, so wait until after the loop to check total.
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#if !defined(ROCKSDB_LITE)
+
+TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  ASSERT_OK(db_->VerifyChecksum(GetReadOptions()));
+  // The files are tiny so there should have just been one read per file.
+  int expected = kNumFiles;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, VerifyFileChecksums) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  ASSERT_OK(db_->VerifyFileChecksums(GetReadOptions()));
+  // The files are tiny so there should have just been one read per file.
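+  // (VerifyFileChecksums() recomputes whole-file checksums, in contrast to
+  // the per-block verification done by VerifyChecksum() above.)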
+  int expected = kNumFiles;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+class DBRateLimiterOnWriteTest : public DBTestBase {
+ public:
+  explicit DBRateLimiterOnWriteTest()
+      : DBTestBase("db_rate_limiter_on_write_test", /*env_do_fsync=*/false) {}
+
+  void Init() {
+    options_ = GetOptions();
+    ASSERT_OK(TryReopenWithColumnFamilies({"default"}, options_));
+    Random rnd(301);
+    for (int i = 0; i < kNumFiles; i++) {
+      ASSERT_OK(Put(0, kStartKey, rnd.RandomString(2)));
+      ASSERT_OK(Put(0, kEndKey, rnd.RandomString(2)));
+      ASSERT_OK(Flush(0));
+    }
+  }
+
+  Options GetOptions() {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    options.rate_limiter.reset(NewGenericRateLimiter(
+        1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+        10 /* fairness */, RateLimiter::Mode::kWritesOnly));
+    options.table_factory.reset(
+        NewBlockBasedTableFactory(BlockBasedTableOptions()));
+    return options;
+  }
+
+ protected:
+  inline const static int64_t kNumFiles = 3;
+  inline const static std::string kStartKey = "a";
+  inline const static std::string kEndKey = "b";
+  Options options_;
+};
+
+TEST_F(DBRateLimiterOnWriteTest, Flush) {
+  std::int64_t prev_total_request = 0;
+
+  Init();
+
+  std::int64_t actual_flush_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+  std::int64_t expected_flush_request = kNumFiles;
+  EXPECT_EQ(actual_flush_request, expected_flush_request);
+  EXPECT_EQ(actual_flush_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_HIGH));
+}
+
+TEST_F(DBRateLimiterOnWriteTest, Compact) {
+  Init();
+
+  // Pre-compaction:
+  // level-0 : `kNumFiles` SST files overlapping on [kStartKey, kEndKey]
+#ifndef ROCKSDB_LITE
+  std::string files_per_level_pre_compaction = std::to_string(kNumFiles);
+  ASSERT_EQ(files_per_level_pre_compaction, FilesPerLevel(0 /* cf */));
+#endif  // !ROCKSDB_LITE
+
+  std::int64_t prev_total_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+
+  Compact(kStartKey, kEndKey);
+
+  std::int64_t actual_compaction_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+
+  // Post-compaction:
+  // level-0 : 0 SST files
+  // level-1 : 1 SST file
+#ifndef ROCKSDB_LITE
+  std::string files_per_level_post_compaction = "0,1";
+  ASSERT_EQ(files_per_level_post_compaction, FilesPerLevel(0 /* cf */));
+#endif  // !ROCKSDB_LITE
+
+  std::int64_t expected_compaction_request = 1;
+  EXPECT_EQ(actual_compaction_request, expected_compaction_request);
+  EXPECT_EQ(actual_compaction_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+}
+
+class DBRateLimiterOnWriteWALTest
+    : public DBRateLimiterOnWriteTest,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, Env::IOPriority>> {
+ public:
+  static std::string GetTestNameSuffix(
+      ::testing::TestParamInfo<std::tuple<bool, bool, Env::IOPriority>> info) {
+    std::ostringstream oss;
+    if (std::get<0>(info.param)) {
+      oss << "DisableWAL";
+    } else {
+      oss << "EnableWAL";
+    }
+    if (std::get<1>(info.param)) {
+      oss << "_ManualWALFlush";
+    } else {
+      oss << "_AutoWALFlush";
+    }
+    if (std::get<2>(info.param) == Env::IO_USER) {
+      oss << "_RateLimitAutoWALFlush";
+    } else if (std::get<2>(info.param) == Env::IO_TOTAL) {
+      oss << "_NoRateLimitAutoWALFlush";
+    } else {
+      oss << "_RateLimitAutoWALFlushWithIncorrectPriority";
+    }
+    return oss.str();
+  }
+
+  explicit DBRateLimiterOnWriteWALTest()
+      : disable_wal_(std::get<0>(GetParam())),
+        manual_wal_flush_(std::get<1>(GetParam())),
+        rate_limiter_priority_(std::get<2>(GetParam())) {}
+
+  void Init() {
+    options_ = GetOptions();
+    options_.manual_wal_flush = manual_wal_flush_;
+    Reopen(options_);
+  }
+
+  WriteOptions GetWriteOptions() {
+    WriteOptions write_options;
+    write_options.disableWAL = disable_wal_;
+    write_options.rate_limiter_priority = rate_limiter_priority_;
+    return write_options;
+  }
+
+ protected:
+  bool disable_wal_;
+  bool manual_wal_flush_;
+  Env::IOPriority rate_limiter_priority_;
};
+
+INSTANTIATE_TEST_CASE_P(
+    DBRateLimiterOnWriteWALTest, DBRateLimiterOnWriteWALTest,
+    ::testing::Values(std::make_tuple(false, false, Env::IO_TOTAL),
+                      std::make_tuple(false, false, Env::IO_USER),
+                      std::make_tuple(false, false, Env::IO_HIGH),
+                      std::make_tuple(false, true, Env::IO_USER),
+                      std::make_tuple(true, false, Env::IO_USER)),
+    DBRateLimiterOnWriteWALTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
+  Init();
+
+  const bool no_rate_limit_auto_wal_flush =
+      (rate_limiter_priority_ == Env::IO_TOTAL);
+  const bool valid_arg = (rate_limiter_priority_ == Env::IO_USER &&
+                          !disable_wal_ && !manual_wal_flush_);
+
+  std::int64_t prev_total_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  Status s = Put("foo", "v1", GetWriteOptions());
+
+  if (no_rate_limit_auto_wal_flush || valid_arg) {
+    EXPECT_TRUE(s.ok());
+  } else {
+    EXPECT_TRUE(s.IsInvalidArgument());
+    EXPECT_TRUE(s.ToString().find("WriteOptions::rate_limiter_priority") !=
+                std::string::npos);
+  }
+
+  std::int64_t actual_auto_wal_flush_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+  std::int64_t expected_auto_wal_flush_request = valid_arg ? 1 : 0;
+
+  EXPECT_EQ(actual_auto_wal_flush_request, expected_auto_wal_flush_request);
+  EXPECT_EQ(actual_auto_wal_flush_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_readonly_with_timestamp_test.cc b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
new file mode 100644
index 000000000..3f53e7806
--- /dev/null
+++ b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
@@ -0,0 +1,960 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
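+//
+// This file tests opening a DB that has user-defined timestamps enabled in
+// read-only (and compacted) mode, and exercises Get and iterators against
+// various read timestamps.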
+ +#include "db/db_with_timestamp_test_util.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase { + public: + DBReadOnlyTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {} + + protected: +#ifndef ROCKSDB_LITE + void CheckDBOpenedAsCompactedDBWithOneLevel0File() { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + // Only 1 L0 file. + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + // L0 is the max level. + ASSERT_EQ(storage_info->num_non_empty_levels(), 1); + } + + void CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles() { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + // L0 has no files. + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // All other levels have no files except the highest level with files. + for (int i = 1; i < storage_info->num_non_empty_levels() - 1; ++i) { + ASSERT_FALSE(storage_info->LevelFilesBrief(i).num_files > 0); + } + + // The highest level with files have some files. + int highest_non_empty_level = storage_info->num_non_empty_levels() - 1; + ASSERT_TRUE( + storage_info->LevelFilesBrief(highest_non_empty_level).num_files > 0); + } +#endif // !ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::string write_timestamp = Timestamp(1, 0); + WriteOptions write_opts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), write_timestamp, + "value" + std::to_string(key)); + ASSERT_OK(s); + } + + // Reopen the database in read only mode to test its timestamp support. 
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+
+  auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+                                Slice expected_value, std::string expected_ts) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+    ASSERT_EQ(expected_value, value_from_get);
+    ASSERT_EQ(expected_ts, timestamp);
+  };
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
+    for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+         it->Next(), ++count, ++key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+
+    // Backward iterate.
+    count = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+    // SeekToFirst()/SeekToLast() with lower/upper bounds.
+    // Then iter with lower and upper bounds.
+    uint64_t l = 0;
+    uint64_t r = kMaxKey + 1;
+    while (l < r) {
+      std::string lb_str = Key1(l);
+      Slice lb = lb_str;
+      std::string ub_str = Key1(r);
+      Slice ub = ub_str;
+      read_opts.iterate_lower_bound = &lb;
+      read_opts.iterate_upper_bound = &ub;
+      it.reset(db_->NewIterator(read_opts));
+      for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+           it->Valid(); it->Next(), ++key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+      for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+           it->Valid(); it->Prev(), --key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      l += (kMaxKey / 100);
+      r -= (kMaxKey / 100);
+    }
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, Iterators) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  const std::string read_timestamp = Timestamp(2, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<Iterator*> iters;
+  ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+  ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+  int count = 0;
+  uint64_t key = 0;
+  // Forward iterate.
+  for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+       iters[0]->Next(), ++count, ++key) {
+    CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+                       "value" + std::to_string(key), write_timestamp);
+  }
+
+  size_t expected_count = kMaxKey - 0 + 1;
+  ASSERT_EQ(expected_count, count);
+  delete iters[0];
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorsWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetWithOnlyOneL0File) {
+  const int kNumKeysPerFile = 1026 * 2;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    int count = 0;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+      std::string value_from_get;
+      std::string timestamp;
+      ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+      ASSERT_EQ("value" + std::to_string(i), value_from_get);
+      ASSERT_EQ(write_timestamps[i], timestamp);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetWithOnlyHighestNonEmptyLevelFiles) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    int count = 0;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+      std::string value_from_get;
+      std::string timestamp;
+      ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+      ASSERT_EQ("value" + std::to_string(i), value_from_get);
+      ASSERT_EQ(write_timestamps[i], timestamp);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<std::string> timestamps;
+  std::vector<Status> status_list =
+      db_->MultiGet(read_opts, keys, &values, &timestamps);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<std::string> timestamps;
+  std::vector<Status> status_list =
+      db_->MultiGet(read_opts, keys, &values, &timestamps);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<Status> status_list = db_->MultiGet(read_opts, keys, &values);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBMultiGetWithOnlyOneL0File) {
+  const int kNumKeysPerFile = 1026 * 2;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::vector<std::string> key_strs;
+    std::vector<Slice> keys;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      key_strs.push_back(Key1(key));
+    }
+    for (const auto& key_str : key_strs) {
+      keys.emplace_back(key_str);
+    }
+    size_t batch_size = kMaxKey - start_keys[i] + 1;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+    std::vector<Status> status_list =
+        db_->MultiGet(read_opts, keys, &values, &timestamps);
+    ASSERT_EQ(batch_size, values.size());
+    ASSERT_EQ(batch_size, timestamps.size());
+    for (uint64_t idx = 0; idx < values.size(); ++idx) {
+      ASSERT_EQ("value" + std::to_string(i), values[idx]);
+      ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+      ASSERT_OK(status_list[idx]);
+    }
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetWithOnlyHighestNonEmptyLevelFiles) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::vector<std::string> key_strs;
+    std::vector<Slice> keys;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      key_strs.push_back(Key1(key));
+    }
+    for (const auto& key_str : key_strs) {
+      keys.emplace_back(key_str);
+    }
+    size_t batch_size = kMaxKey - start_keys[i] + 1;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+    std::vector<Status> status_list =
+        db_->MultiGet(read_opts, keys, &values, &timestamps);
+    ASSERT_EQ(batch_size, values.size());
+    ASSERT_EQ(batch_size, timestamps.size());
+    for (uint64_t idx = 0; idx < values.size(); ++idx) {
+      ASSERT_EQ("value" + std::to_string(i), values[idx]);
+      ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+      ASSERT_OK(status_list[idx]);
+    }
+  }
+
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_secondary_test.cc b/src/rocksdb/db/db_secondary_test.cc
new file mode 100644
index 000000000..20d7534e0
--- /dev/null
+++ b/src/rocksdb/db/db_secondary_test.cc
@@ -0,0 +1,1693 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "db/db_with_timestamp_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class DBSecondaryTestBase : public DBBasicTestWithTimestampBase { + public: + explicit DBSecondaryTestBase(const std::string& dbname) + : DBBasicTestWithTimestampBase(dbname), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { + secondary_path_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTestBase() override { + CloseSecondary(); + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_path_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); + } + + void OpenSecondary(const Options& options); + + Status TryOpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h)); + } + handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; +}; + +void DBSecondaryTestBase::OpenSecondary(const Options& options) { + ASSERT_OK(TryOpenSecondary(options)); +} + +Status DBSecondaryTestBase::TryOpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); + return s; +} + +void DBSecondaryTestBase::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTestBase::CheckFileTypeCounts(const std::string& dir, + int expected_log, + int expected_sst, + int expected_manifest) const { + std::vector filenames; + ASSERT_OK(env_->GetChildren(dir, &filenames)); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kWalFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + +class DBSecondaryTest : public DBSecondaryTestBase { + public: + explicit DBSecondaryTest() : DBSecondaryTestBase("db_secondary_test") {} +}; + +TEST_F(DBSecondaryTest, FailOpenIfLoggerCreationFail) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + Reopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + 
+  SyncPoint::GetInstance()->SetCallBack(
+      "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+        auto* s = reinterpret_cast<Status*>(arg);
+        assert(s);
+        *s = Status::IOError("Injected");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.max_open_files = -1;
+  Status s = TryOpenSecondary(options);
+  ASSERT_EQ(nullptr, options.info_log);
+  ASSERT_TRUE(s.IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSecondaryTest, NonExistingDb) {
+  Destroy(last_options_);
+
+  Options options = GetDefaultOptions();
+  options.env = env_;
+  options.max_open_files = -1;
+  const std::string dbname = "/doesnt/exist";
+  Status s =
+      DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_OK(Put("foo", "foo_value"));
+  ASSERT_OK(Put("bar", "bar_value"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  Close();
+
+  ASSERT_OK(ReopenAsSecondary(options));
+  ASSERT_EQ("foo_value", Get("foo"));
+  ASSERT_EQ("bar_value", Get("bar"));
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  auto db1 = static_cast<DBImplSecondary*>(db_);
+  ASSERT_NE(nullptr, db1);
+  Iterator* iter = db1->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    if (0 == count) {
+      ASSERT_EQ("bar", iter->key().ToString());
+      ASSERT_EQ("bar_value", iter->value().ToString());
+    } else if (1 == count) {
+      ASSERT_EQ("foo", iter->key().ToString());
+      ASSERT_EQ("foo_value", iter->value().ToString());
+    }
+    ++count;
+  }
+  delete iter;
+  ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+  Close();
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result));
+
+  ASSERT_EQ(result.output_files.size(), 1);
+  InternalKey smallest, largest;
+  smallest.DecodeFrom(result.output_files[0].smallest_internal_key);
+  largest.DecodeFrom(result.output_files[0].largest_internal_key);
+  ASSERT_EQ(smallest.user_key().ToString(), "bar");
+  ASSERT_EQ(largest.user_key().ToString(), "foo");
+  ASSERT_EQ(result.output_level, 1);
+  ASSERT_EQ(result.output_path, this->secondary_path_);
+  ASSERT_EQ(result.num_output_records, 2);
+  ASSERT_GT(result.bytes_written, 0);
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  const int kRangeL2 = 10;
+  const int kRangeL1 = 30;
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i * kRangeL2), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i * kRangeL1), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(1);
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put(Key(i * 30), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key(i * 30 + 50), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+
+  // pick 2 files on level 0 for compaction, which have 3 overlapping files on
+  // L1
+  CompactionServiceInput input1;
+  input1.input_files.push_back(meta.levels[0].files[2].name);
+  input1.input_files.push_back(meta.levels[0].files[3].name);
+  input1.input_files.push_back(meta.levels[1].files[0].name);
+  input1.input_files.push_back(meta.levels[1].files[1].name);
+  input1.input_files.push_back(meta.levels[1].files[2].name);
+
+  input1.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input1.db_id));
+
+  options.max_open_files = -1;
+  Close();
+
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+  CompactionServiceResult result;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input1, &result));
+  ASSERT_OK(result.status);
+
+  // pick 2 files on level 1 for compaction, which have 6 overlapping files on
+  // L2
+  CompactionServiceInput input2;
+  input2.input_files.push_back(meta.levels[1].files[1].name);
+  input2.input_files.push_back(meta.levels[1].files[2].name);
+  for (int i = 3; i < 9; i++) {
+    input2.input_files.push_back(meta.levels[2].files[i].name);
+  }
+
+  input2.output_level = 2;
+  input2.db_id = input1.db_id;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input2, &result));
+  ASSERT_OK(result.status);
+
+  CloseSecondary();
+
+  // delete all L2 files without updating the manifest
+  for (auto& file : meta.levels[2].files) {
+    ASSERT_OK(env_->DeleteFile(dbname_ + file.name));
+  }
+  OpenSecondary(options);
+  cfh = db_secondary_->DefaultColumnFamily();
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input2, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+
+  // TODO: the L0 -> L1 compaction should succeed; currently the version is not
+  // built if files are missing.
+
+  // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(OpenAndCompactOptions(),
+  // cfh, input1, &result));
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  // trigger compaction to delete the files for secondary instance compaction
+  ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
+  ASSERT_OK(Put("bar", "bar_value" + std::to_string(3)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  Close();
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  Close();
+
+  ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0]));
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+
+  input.input_files.erase(input.input_files.begin());
+
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result));
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  const auto verify_db_func = [&](const std::string& foo_val,
+                                  const std::string& bar_val) {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+    ASSERT_EQ(foo_val, value);
+    ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); +} + +namespace { +class TraceFileEnv : public EnvWrapper { + public: + explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} + static const char* kClassName() { return "TraceFileEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& env_options) override { + class TracedRandomAccessFile : public RandomAccessFile { + public: + TracedRandomAccessFile(std::unique_ptr&& target, + std::atomic& counter) + : target_(std::move(target)), files_closed_(counter) {} + ~TracedRandomAccessFile() override { + files_closed_.fetch_add(1, std::memory_order_relaxed); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + + private: + std::unique_ptr target_; + std::atomic& files_closed_; + }; + Status s = target()->NewRandomAccessFile(f, r, env_options); + if (s.ok()) { + r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); + } + return s; + } + + int files_closed() const { + return files_closed_.load(std::memory_order_relaxed); + } + + private: + std::atomic files_closed_{0}; +}; +} // anonymous namespace + +TEST_F(DBSecondaryTest, SecondaryCloseFiles) { + Options options; + options.env = env_; + options.max_open_files = 1; + options.disable_auto_compactions = true; + Reopen(options); + Options options1; + std::unique_ptr traced_env(new TraceFileEnv(env_)); + options1.env = traced_env.get(); + OpenSecondary(options1); + + static const auto verify_db = [&]() { + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + ASSERT_EQ(iter1->key(), iter2->key()); + ASSERT_EQ(iter1->value(), iter2->value()); + } + ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + }; + + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Put("c", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Put("d", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); + + Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); + ASSERT_TRUE(s.IsNotSupported()); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { + 
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+  }
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  const auto verify_db_func = [&](const std::string& foo_val,
+                                  const std::string& bar_val) {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+    ASSERT_EQ(foo_val, value);
+    ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+    ASSERT_EQ(bar_val, value);
+    Iterator* iter = db_secondary_->NewIterator(ropts);
+    ASSERT_NE(nullptr, iter);
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ(foo_val, iter->value().ToString());
+    iter->Seek("bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bar", iter->key().ToString());
+    ASSERT_EQ(bar_val, iter->value().ToString());
+    size_t count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ++count;
+    }
+    ASSERT_EQ(2, count);
+    delete iter;
+  };
+
+  verify_db_func("foo_value2", "bar_value2");
+
+  ASSERT_OK(Put("foo", "new_foo_value"));
+  ASSERT_OK(Put("bar", "new_bar_value"));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value", "new_bar_value");
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "new_foo_value_1"));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+  }
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  const auto verify_db = [&](const std::string& foo_val,
+                             const std::string& bar_val) {
+    std::string value;
+    ReadOptions ropts;
+    Status s = db_secondary_->Get(ropts, "foo", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(foo_val, value);
+
+    s = db_secondary_->Get(ropts, "bar", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(bar_val, value);
+  };
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    verify_db("foo_value2", "bar_value2");
+  }
+}
+
+TEST_F(DBSecondaryTest, RefreshIterator) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  std::unique_ptr<Iterator> it(db_secondary_->NewIterator(ReadOptions()));
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    if (0 == i) {
+      it->Seek("foo");
+      ASSERT_FALSE(it->Valid());
+      ASSERT_OK(it->status());
+
+      ASSERT_OK(it->Refresh());
+
+      it->Seek("foo");
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value0", it->value());
+    } else {
+      it->Seek("foo");
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value());
+      ASSERT_OK(it->status());
+
+      ASSERT_OK(it->Refresh());
+
+      it->Seek("foo");
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value" + std::to_string(i), it->value());
+    }
+  }
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+  Options options;
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+  cf_descs.emplace_back("pikachu", options1);
+  cf_descs.emplace_back("eevee", options1);
+  Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+                                 &handles_secondary_, &db_secondary_);
+  ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+  Options options;
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ASSERT_EQ(0, handles_secondary_.size());
+  ASSERT_NE(nullptr, db_secondary_);
+
+  ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+  ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+  ASSERT_OK(Flush(0 /*cf*/));
+  ASSERT_OK(Flush(1 /*cf*/));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+        "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+       {"DBImpl::Open:AfterDeleteFiles",
+        "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+        "1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread ro_db_thread([&]() {
+    Options options1;
+    options1.env = env_;
+    options1.max_open_files = -1;
+    Status s = TryOpenSecondary(options1);
+    ASSERT_TRUE(s.IsTryAgain());
+
+    // Try again
+    OpenSecondary(options1);
+    CloseSecondary();
+  });
+  Reopen(options);
+  ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  Iterator* iter = db_secondary_->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key().ToString());
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++count;
+  }
+  ASSERT_EQ(2, count);
+  delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_NE(nullptr, db_secondary_full());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  Iterator* iter = db_secondary_->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key().ToString());
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++count;
+  }
+  ASSERT_EQ(2, count);
+  delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+  Options options;
+  options.env = env_;
+  const std::string kCfName1 = "pikachu";
+  CreateAndReopenWithCF({kCfName1}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+  ASSERT_EQ(2, handles_secondary_.size());
+
+  ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+  ASSERT_OK(Flush(1 /*cf*/));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+  ASSERT_EQ("foo_val_1", value);
+
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  Close();
+  CheckFileTypeCounts(dbname_, 1, 0, 1);
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  value.clear();
+  ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+  ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  const std::string cf1_name("test_cf");
+  CreateAndReopenWithCF({cf1_name}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+                                  options1);
+
+  const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+  // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+  // ..., 9.
+  const int kNumKeys = 10;
+  // Create kNumFiles SST files.
+  for (int i = 0; i != kNumFiles; ++i) {
+    for (int j = 0; j != kNumKeys; ++j) {
+      ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  const auto& range_scan_db = [&]() {
+    ReadOptions tmp_ropts;
+    tmp_ropts.total_order_seek = true;
+    tmp_ropts.verify_checksums = true;
+    std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+    int cnt = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+      ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+      ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+                iter->value().ToString());
+    }
+  };
+
+  range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, close the
+  // primary, restart it, perform a full compaction, then close and restart it
+  // again, so that the next time the secondary tries to catch up with the
+  // primary, it will skip the MANIFEST in the middle.
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  range_scan_db();
+}
+
+TEST_F(DBSecondaryTest, SwitchManifestTwice) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  const std::string cf1_name("test_cf");
+  CreateAndReopenWithCF({cf1_name}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+                                  options1);
+
+  ASSERT_OK(Put("0", "value0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  std::string value;
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+  ASSERT_EQ("value0", value);
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(Put("0", "value1"));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+
+  ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+  ASSERT_EQ("value1", value);
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
+  const int kNumKeysPerMemtable = 1;
+  Options options;
+  options.env = env_;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 2;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  const auto& verify_db = [](DB* db1, DB* db2) {
+    ASSERT_NE(nullptr, db1);
+    ASSERT_NE(nullptr, db2);
+    ReadOptions read_opts;
+    read_opts.verify_checksums = true;
+    std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+    std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+    it1->SeekToFirst();
+    it2->SeekToFirst();
+    for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+      ASSERT_EQ(it1->key(), it2->key());
+      ASSERT_EQ(it1->value(), it2->value());
+    }
+    ASSERT_FALSE(it1->Valid());
+    ASSERT_FALSE(it2->Valid());
+
+    for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + } + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + ASSERT_OK(wb.Put("key0", "value0")); + ASSERT_OK(wb.Put("key1", "value1")); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + 
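+  // Note (added commentary): a secondary iterator pins the superversion that
+  // is current when NewIterator() is called. The checks below rely on this:
+  // iter1, created before TryCatchUpWithPrimary(), keeps seeing the old
+  // (empty) state, while iter2, created afterwards, sees the caught-up data.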
ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(iter1->status()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + ASSERT_OK(wb1.Put("key0", "value01")); + ASSERT_OK(wb1.Put("key1", "value11")); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + ASSERT_OK(wb2.Put("key0", "new_value0")); + ASSERT_OK(wb2.Delete("key1")); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); + ASSERT_OK(iter3->status()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} + +TEST_F(DBSecondaryTest, StartFromInconsistent) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options1; + options1.env = env_; + Status s = TryOpenSecondary(options1); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + + Options options1; + options1.env = env_; + OpenSecondary(options1); + + { + std::string 
value; + ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("value", value); + } + + ASSERT_OK(Put("bar", "value1")); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = db_secondary_->TryCatchUpWithPrimary(); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, OpenWithTransactionDB) { + Options options = CurrentOptions(); + options.create_if_missing = true; + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db; + + std::vector cfs = {"new_CF"}; + CreateColumnFamilies(cfs, options); + ASSERT_EQ(handles_.size(), 1); + + WriteOptions wopts; + TransactionOptions txn_opts; + Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn1, nullptr); + ASSERT_OK(txn1->Put(handles_[0], "k1", "v1")); + ASSERT_OK(txn1->Commit()); + delete txn1; + + options = CurrentOptions(); + options.max_open_files = -1; + ASSERT_OK(TryOpenSecondary(options)); +} + +class DBSecondaryTestWithTimestamp : public DBSecondaryTestBase { + public: + explicit DBSecondaryTestWithTimestamp() + : DBSecondaryTestBase("db_secondary_test_with_timestamp") {} +}; +TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::string write_timestamp = Timestamp(1, 0); + WriteOptions write_opts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), write_timestamp, + "value" + std::to_string(key)); + ASSERT_OK(s); + } + + // Reopen the database as secondary instance to test its timestamp support. 
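+  // The secondary is expected to apply the same read-time timestamp sanity
+  // check as a regular DB: a read timestamp whose size differs from the
+  // comparator's configured timestamp size should be rejected with
+  // Status::InvalidArgument, both for iterators and for point lookups.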
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
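+  // Since the comparator here is timestamp-aware, reads that omit
+  // ReadOptions::timestamp are expected to fail with InvalidArgument.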
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+                                Slice expected_value,
+                                std::string expected_ts) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+    ASSERT_EQ(expected_value, value_from_get);
+    ASSERT_EQ(expected_ts, timestamp);
+  };
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
+    for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+         it->Next(), ++count, ++key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+
+    // Backward iterate.
+    count = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+    // SeekToFirst()/SeekToLast() with lower/upper bounds.
+    // Then iter with lower and upper bounds.
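+    // The loop below repeatedly tightens [l, r) from both ends in steps of
+    // kMaxKey / 100 and re-creates the iterator with iterate_lower_bound /
+    // iterate_upper_bound set, checking that SeekToFirst()/SeekToLast() only
+    // surface keys inside the effective bounds.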
+    uint64_t l = 0;
+    uint64_t r = kMaxKey + 1;
+    while (l < r) {
+      std::string lb_str = Key1(l);
+      Slice lb = lb_str;
+      std::string ub_str = Key1(r);
+      Slice ub = ub_str;
+      read_opts.iterate_lower_bound = &lb;
+      read_opts.iterate_upper_bound = &ub;
+      it.reset(db_->NewIterator(read_opts));
+      for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+           it->Valid(); it->Next(), ++key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+      for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+           it->Valid(); it->Prev(), --key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      l += (kMaxKey / 100);
+      r -= (kMaxKey / 100);
+    }
+  }
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
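+  // NewIterators() should reject a read timestamp when the column family's
+  // comparator was not configured with timestamp support, again surfacing
+  // InvalidArgument rather than returning any iterators.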
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorsWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, Iterators) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  const std::string read_timestamp = Timestamp(2, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<Iterator*> iters;
+  ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+  ASSERT_EQ(static_cast<size_t>(1), iters.size());
+
+  int count = 0;
+  uint64_t key = 0;
+  // Forward iterate.
+  for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+       iters[0]->Next(), ++count, ++key) {
+    CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+                       "value" + std::to_string(key), write_timestamp);
+  }
+
+  size_t expected_count = kMaxKey - 0 + 1;
+  ASSERT_EQ(expected_count, count);
+  delete iters[0];
+
+  Close();
+}
+#endif  //! ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..7f031444a
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1868 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "env/mock_env.h" +#include "file/sst_file_manager_impl.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/table.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBSSTTest : public DBTestBase { + public: + DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {} +}; + +#ifndef ROCKSDB_LITE +// A class which remembers the name of each flushed file. +class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() override {} + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + std::lock_guard lock(mutex_); + flushed_files_.push_back(info.file_path); + } + + std::vector GetFlushedFiles() { + std::lock_guard lock(mutex_); + std::vector result; + for (auto fname : flushed_files_) { + result.push_back(fname); + } + return result; + } + void ClearFlushedFiles() { + std::lock_guard lock(mutex_); + flushed_files_.clear(); + } + + private: + std::vector flushed_files_; + std::mutex mutex_; +}; +#endif // ROCKSDB_LITE + +TEST_F(DBSSTTest, DontDeletePendingOutputs) { + Options options; + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Every time we write to a table file, call FOF/POF with full DB scan. 
This + // will make sure our pending_outputs_ protection work correctly + std::function purge_obsolete_files_function = [&]() { + JobContext job_context(0); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /*force*/); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + job_context.Clean(); + }; + + env_->table_write_callback_ = &purge_obsolete_files_function; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("a", "begin")); + ASSERT_OK(Put("z", "end")); + ASSERT_OK(Flush()); + } + + // If pending output guard does not work correctly, PurgeObsoleteFiles() will + // delete the file that Compaction is trying to create, causing this: error + // db/db_test.cc:975: IO error: + // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory + Compact("a", "b"); +} + +// 1 Create some SST files by inserting K-V pairs into DB +// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file +// 3 Open DB and check if all key can be read +TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.num_levels = 4; + DestroyAndReopen(options); + + Random rnd(301); + int key_id = 0; + for (int i = 0; i < 10; ++i) { + GenerateNewFile(&rnd, &key_id, false); + } + ASSERT_OK(Flush()); + Close(); + int const num_files = GetSstFileCount(dbname_); + ASSERT_GT(num_files, 0); + + Reopen(options); + std::vector values; + values.reserve(key_id); + for (int k = 0; k < key_id; ++k) { + values.push_back(Get(Key(k))); + } + Close(); + + std::vector filenames; + GetSstFiles(env_, dbname_, &filenames); + int num_ldb_files = 0; + for (size_t i = 0; i < filenames.size(); ++i) { + if (i & 1) { + continue; + } + std::string const rdb_name = dbname_ + "/" + filenames[i]; + std::string const ldb_name = Rocks2LevelTableFileName(rdb_name); + ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok()); + ++num_ldb_files; + } + ASSERT_GT(num_ldb_files, 0); + ASSERT_EQ(num_files, GetSstFileCount(dbname_)); + + Reopen(options); + for (int k = 0; k < key_id; ++k) { + ASSERT_EQ(values[k], Get(Key(k))); + } + Destroy(options); +} + +// Check that we don't crash when opening DB with +// DBOptions::skip_checking_sst_file_sizes_on_db_open = true. +TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { + ASSERT_OK(Put("pika", "choo")); + ASSERT_OK(Flush()); + + // Just open the DB with the option set to true and check that we don't crash. 
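+  // (With this option set, DB::Open skips querying the size of every table
+  // file listed in the MANIFEST, which speeds up opening DBs with many files
+  // at the cost of not catching truncated or missing SSTs until they are
+  // actually read.)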
+ Options options; + options.env = env_; + options.skip_checking_sst_file_sizes_on_db_open = true; + Reopen(options); + + ASSERT_EQ("choo", Get("pika")); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, DontDeleteMovedFile) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // If the moved file is actually deleted (the move-safeguard in + // ~Version::Version() is not there), we get this failure: + // Corruption: Can't access /000009.sst + Reopen(options); +} + +// This reproduces a bug where we don't delete a file because when it was +// supposed to be deleted, it was blocked by pending_outputs +// Consider: +// 1. current file_number is 13 +// 2. compaction (1) starts, blocks deletion of all files starting with 13 +// (pending outputs) +// 3. file 13 is created by compaction (2) +// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file +// 13 has no references, it is put into VersionSet::obsolete_files_ +// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 +// is deleted from obsolete_files_ set. +// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by +// pending outputs since compaction (1) is still running. It is not deleted and +// it is not present in obsolete_files_ anymore. Therefore, we never delete it. +TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2 MB + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + options.max_background_flushes = 2; + options.max_background_compactions = 2; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + Reopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + test::SleepingBackgroundTask blocking_thread; + port::Mutex mutex_; + bool already_blocked(false); + + // block the flush + std::function block_first_time = [&]() { + bool blocking = false; + { + MutexLock l(&mutex_); + if (!already_blocked) { + blocking = true; + already_blocked = true; + } + } + if (blocking) { + blocking_thread.DoSleep(); + } + }; + env_->table_write_callback_ = &block_first_time; + // Insert 2.5MB data, which should trigger a flush because we exceed + // write_buffer_size. 
The flush will be blocked with block_first_time + // pending_file is protecting all the files created after + for (int j = 0; j < 256; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024))); + } + blocking_thread.WaitUntilSleeping(); + + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto file_on_L2 = metadata[0].name; + listener->SetExpectedFileName(dbname_ + file_on_L2); + + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + + // finish the flush! + blocking_thread.WakeUp(); + blocking_thread.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + // File just flushed is too big for L0 and L1 so gets moved to L2. + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0)); + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 2U); + + // This file should have been deleted during last compaction + ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); + listener->VerifyMatchedCount(1); +} + +// Test that producing an empty .sst file does not write it out to +// disk, and that the DeleteFile() env method is not called for +// removing the non-existing file later. +TEST_F(DBSSTTest, DeleteFileNotCalledForNotCreatedSSTFile) { + Options options = CurrentOptions(); + options.env = env_; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + Reopen(options); + + // Flush the empty database. + ASSERT_OK(Flush()); + ASSERT_EQ("", FilesPerLevel(0)); + + // We expect no .sst files. + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 0U); + + // We expect no file deletions. + listener->VerifyMatchedCount(0); +} + +// Test that producing a non-empty .sst file does write it out to +// disk, and that the DeleteFile() env method is not called for removing +// the file later. +TEST_F(DBSSTTest, DeleteFileNotCalledForCreatedSSTFile) { + Options options = CurrentOptions(); + options.env = env_; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + Reopen(options); + + ASSERT_OK(Put("pika", "choo")); + + // Flush the non-empty database. + ASSERT_OK(Flush()); + ASSERT_EQ("1", FilesPerLevel(0)); + + // We expect 1 .sst files. + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + + // We expect no file deletions. 
+ listener->VerifyMatchedCount(0); +} + +TEST_F(DBSSTTest, DBWithSstFileManager) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", + [&](void* /*arg*/) { files_deleted++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 25; i++) { + GenerateNewRandomFile(&rnd); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Verify that we are tracking all sst files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + // We flushed at least 25 files + ASSERT_GE(files_added, 25); + // Compaction must have deleted some files + ASSERT_GT(files_deleted, 0); + // No files were moved + ASSERT_EQ(files_moved, 0); + + Close(); + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened + Close(); + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if 
(file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + DestroyAndReopen(options); + Random rnd(301); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(files_added, blob_files.size()); + // No blob file is obsoleted. + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + // No files were moved. + ASSERT_EQ(files_moved, 0); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + + // Verify that we are tracking all sst and blob files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + Close(); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened. + Close(); + + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Destroy DB and it will remove all the blob files from sst file manager and + // blob files deletion will go through ScheduleFileDeletion. 
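+  // (When an SstFileManager is attached, DestroyDB() is expected to route
+  // file removal through the manager's DeleteScheduler, which is why the two
+  // counters below go from 0 to the number of blob files.)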
+ ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_EQ(files_deleted, blob_files.size()); + ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + Random rnd(301); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + constexpr char fifth_key[] = "fifth_key"; + constexpr char fifth_value[] = "fifth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Put(fifth_key, fifth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 5); + ASSERT_EQ(files_added, 5); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + ASSERT_EQ(files_moved, 0); + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + size_t expected_number_of_files = original_blob_files.size(); + // Note: turning off enable_blob_files before the compaction results 
in + // garbage collected values getting inlined. + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + expected_number_of_files -= cutoff_index; + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + ASSERT_EQ(Get(fifth_key), fifth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + // No new file is added. + ASSERT_EQ(files_added, 0); + ASSERT_EQ(files_deleted, cutoff_index); + ASSERT_EQ(files_scheduled_to_delete, cutoff_index); + ASSERT_EQ(files_moved, 0); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(files_deleted, 5); + ASSERT_EQ(files_scheduled_to_delete, 5); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBSSTTestRateLimit : public DBSSTTest, + public ::testing::WithParamInterface { + public: + DBSSTTestRateLimit() : DBSSTTest() {} + ~DBSSTTestRateLimit() override {} +}; + +TEST_P(DBSSTTestRateLimit, RateLimitedDelete) { + Destroy(last_options_); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBSSTTest::RateLimitedDelete:1", + "DeleteScheduler::BackgroundEmptyTrash"}, + }); + + std::vector penalties; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::BackgroundEmptyTrash:Wait", + [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + // Turn timed wait into a simulated sleep + uint64_t* abs_time_us = static_cast(arg); + uint64_t cur_time = env_->NowMicros(); + if (*abs_time_us > cur_time) { + env_->MockSleepForMicroseconds(*abs_time_us - cur_time); + } + + // Plus an additional short, random amount + env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10)); + + // Set wait until time to before (actual) current time to force not + // to sleep + *abs_time_us = Env::Default()->NowMicros(); + }); + + // Disable PeriodicTaskScheduler as it also has TimedWait, which could update + // the simulated sleep time + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:DisableScheduler", [&](void* arg) { + bool* disable_scheduler = static_cast(arg); + *disable_scheduler = true; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool different_wal_dir = GetParam(); + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); + options.disable_auto_compactions = true; + options.env = env_; 
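+  // (Penalty model asserted later in this test: with a delete rate of
+  // rate_bytes_per_sec, trash file i is expected to be delayed by roughly
+  // cumulative_bytes * 1000000 / rate_bytes_per_sec microseconds, which is
+  // what the BackgroundEmptyTrash:Wait sync point records into `penalties`.)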
+ options.statistics = CreateDBStatistics(); + if (different_wal_dir) { + options.wal_dir = alternative_wal_dir_; + } + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1); + + WriteOptions wo; + if (!different_wal_dir) { + wo.disableWAL = true; + } + Reopen(options); + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v), wo)); + ASSERT_OK(Put("Key3", DummyString(1024, v), wo)); + ASSERT_OK(Put("Key4", DummyString(1024, v), wo)); + ASSERT_OK(Put("Key1", DummyString(1024, v), wo)); + ASSERT_OK(Put("Key4", DummyString(1024, v), wo)); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + + // Compaction will move the 4 files in L0 to trash and create 1 L1 file + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + uint64_t delete_start_time = env_->NowMicros(); + // Hold BackgroundEmptyTrash + TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1"); + sfm->WaitForEmptyTrash(); + uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; + + uint64_t total_files_size = 0; + uint64_t expected_penlty = 0; + ASSERT_EQ(penalties.size(), metadata.size()); + for (size_t i = 0; i < metadata.size(); i++) { + total_files_size += metadata[i].size; + expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec); + ASSERT_EQ(expected_penlty, penalties[i]); + } + ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); + ASSERT_LT(time_spent_deleting, expected_penlty * 1.1); + ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ( + 0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit, + ::testing::Bool()); + +TEST_F(DBSSTTest, RateLimitedWALDelete) { + Destroy(last_options_); + + std::vector penalties; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::BackgroundEmptyTrash:Wait", + [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + SetTimeElapseOnlySleepOnReopen(&options); + + ASSERT_OK(TryReopen(options)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + 
ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + // Compaction will move the 4 files in L0 to trash and create 1 L1 file + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(penalties.size(), 8); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +class DBWALTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { + public: + explicit DBWALTestWithParam() + : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) { + wal_dir_ = std::get<0>(GetParam()); + wal_dir_same_as_dbname_ = std::get<1>(GetParam()); + } + + std::string wal_dir_; + bool wal_dir_same_as_dbname_; +}; + +TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { + class MyEnv : public EnvWrapper { + public: + MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} + const char* Name() const override { return "MyEnv"; } + Status DeleteFile(const std::string& fname) override { + if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { + return Status::OK(); + } + + return target()->DeleteFile(fname); + } + + void set_fake_log_delete(bool fake) { fake_log_delete = fake; } + + private: + bool fake_log_delete; + }; + + std::unique_ptr env(new MyEnv(env_)); + Destroy(last_options_); + + env->set_fake_log_delete(true); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env.get(); + options.wal_dir = dbname_ + wal_dir_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + + Reopen(options); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + if (v == 'c') { + // Maximize the change that the last log file will be preserved in trash + // before restarting the DB. + // We have to set this on the 2nd to last file for it to delay deletion + // on the last file. 
(Quirk of DeleteScheduler::BackgroundEmptyTrash()) + options.sst_file_manager->SetDeleteRateBytesPerSecond(1); + } + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + Close(); + + options.sst_file_manager.reset(); + std::vector filenames; + int trash_log_count = 0; + if (!wal_dir_same_as_dbname_) { + // Forcibly create some trash log files + std::unique_ptr result; + ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions())); + result.reset(); + } + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_GE(trash_log_count, 1); + + env->set_fake_log_delete(false); + Reopen(options); + + filenames.clear(); + trash_log_count = 0; + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_EQ(trash_log_count, 0); + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, + ::testing::Values(std::make_tuple("", true), + std::make_tuple("_wal_dir", false))); + +TEST_F(DBSSTTest, OpenDBWithExistingTrash) { + Options options = CurrentOptions(); + + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */)); + auto sfm = static_cast(options.sst_file_manager.get()); + + Destroy(last_options_); + + // Add some trash files to the db directory so the DB can clean them up + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); + ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); + ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); + + // Reopen the DB and verify that it deletes existing trash files + Reopen(options); + sfm->WaitForEmptyTrash(); + ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); + ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); + ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash")); +} + +// Create a DB with 2 db_paths, and generate multiple files in the 2 +// db_paths using CompactRangeOptions, make sure that files that were +// deleted from first db_path were deleted using DeleteScheduler and +// files in the second path were not. +TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { + std::atomic bg_delete_file(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* /*arg*/) { bg_delete_file++; }); + // The deletion scheduler sometimes skips marking file as trash according to + // a heuristic. In that case the deletion will go through the below SyncPoint. 
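+  // (The heuristic referred to above is the trash-to-DB-size ratio: when the
+  // accumulated trash already exceeds max_trash_db_ratio, files are deleted
+  // immediately instead of being renamed to *.trash first.)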
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; }); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.db_paths.emplace_back(dbname_, 1024 * 100); + options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100); + options.env = env_; + + int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s, + /* max_trash_db_ratio= */ 1.1)); + + ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); + + DestroyAndReopen(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.disableWAL = true; + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A'), wo)); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + Slice begin("Key0"); + Slice end("Key3"); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // Create 4 files in L0 + for (int i = 4; i < 8; i++) { + ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'B'), wo)); + ASSERT_OK(Flush()); + } + ASSERT_EQ("4,1", FilesPerLevel(0)); + + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + begin = "Key4"; + end = "Key7"; + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,2", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 8); + + // Compaction will delete both files and regenerate a file in L1 in second + // db path. The deleted files should still be cleaned up via delete scheduler. 
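+  // (kForceOptimized is expected to rewrite the bottommost-level files while
+  // skipping ones just produced by the same compaction, so the two L1 files
+  // are merged into one and their inputs become trash for the scheduler.)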
+ compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 10); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { + int bg_delete_file = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* /*arg*/) { bg_delete_file++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.env = env_; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + DestroyAndReopen(options); + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A'))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + // Close DB and destroy it using DeleteScheduler + Close(); + + int num_sst_files = 0; + int num_wal_files = 0; + std::vector db_files; + ASSERT_OK(env_->GetChildren(dbname_, &db_files)); + for (std::string f : db_files) { + if (f.substr(f.find_last_of(".") + 1) == "sst") { + num_sst_files++; + } else if (f.substr(f.find_last_of(".") + 1) == "log") { + num_wal_files++; + } + } + ASSERT_GT(num_sst_files, 0); + ASSERT_GT(num_wal_files, 0); + + auto sfm = static_cast(options.sst_file_manager.get()); + + sfm->SetDeleteRateBytesPerSecond(1024 * 1024); + // Set an extra high trash ratio to prevent immediate/non-rate limited + // deletions + sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files); +} + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing 100 keys. + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t first_file_size = 0; + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size)); + ASSERT_EQ(sfm->GetTotalSize(), first_file_size); + + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(first_file_size + 1); + + ASSERT_OK(Put("key1", "val1")); + // This flush will cause bg_error_ and will fail + ASSERT_NOK(Flush()); +} + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing keys. 
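+  // With enable_blob_files set, each flushed value may be written to a
+  // separate blob file, so the flush below produces both an SST and blob
+  // files whose sizes the SstFileManager tracks alongside the table files.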
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t files_size = 0; + uint64_t total_files_size = 0; + std::unordered_map files_in_db; + + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size)); + // Make sure blob files are considered by SSTFileManage in size limits. + ASSERT_GT(files_size, 0); + total_files_size = files_size; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size)); + total_files_size += files_size; + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Set the maximum allowed space usage to the current total size. + sfm->SetMaxAllowedSpaceUsage(total_files_size + 1); + + bool max_allowed_space_reached = false; + bool delete_blob_file = false; + // Sync point called after blob file is closed and max allowed space is + // checked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached", + [&](void* /*arg*/) { max_allowed_space_reached = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BuildTable::AfterDeleteFile", + [&](void* /*arg*/) { delete_blob_file = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + { + "BuildTable::AfterDeleteFile", + "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1", + }, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "val1")); + // This flush will fail + ASSERT_NOK(Flush()); + ASSERT_TRUE(max_allowed_space_reached); + + TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1"); + ASSERT_TRUE(delete_blob_file); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, CancellingCompactionsWorks) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.level0_file_num_compaction_trigger = 2; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + int completed_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) { + sfm->SetMaxAllowedSpaceUsage(0); + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", + [&](void* /*arg*/) { completed_compactions++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + + // Generate a file containing 10 keys. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + uint64_t total_file_size = 0; + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); + + // Generate another file to trigger compaction. 
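+  // With level0_file_num_compaction_trigger == 2, this second flush should
+  // schedule an L0 compaction. Its space reservation fails against the
+  // 2 * total_file_size + 1 limit set above, the compaction is cancelled,
+  // and the CancelledCompaction callback then lifts the limit
+  // (SetMaxAllowedSpaceUsage(0) means "no limit"), so a retry can succeed.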
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // Because we set a callback in CancelledCompaction, we actually + // let the compaction run + ASSERT_GT(completed_compactions, 0); + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + // Make sure the stat is bumped + ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount( + COMPACTION_CANCELLED), + 0); + ASSERT_EQ(0, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_MARKED_TRASH)); + ASSERT_EQ(4, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_DELETED_IMMEDIATELY)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.statistics = CreateDBStatistics(); + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing 10 keys. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + uint64_t total_file_size = 0; + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); + + // Generate another file to trigger compaction. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + // OK, now trigger a manual compaction + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsCompactionTooLarge()); + + // Wait for manual compaction to get scheduled and finish + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + // Make sure the stat is bumped + ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( + COMPACTION_CANCELLED), + 1); + + // Now make sure CompactFiles also gets cancelled + auto l0_files = collector->GetFlushedFiles(); + ASSERT_TRUE( + dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0) + .IsCompactionTooLarge()); + + // Wait for manual compaction to get scheduled and finish + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( + COMPACTION_CANCELLED), + 2); + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + + // Now let the flush through and make sure GetCompactionsReservedSize + // returns to normal + sfm->SetMaxAllowedSpaceUsage(0); + int completed_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + l0_files, 0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + ASSERT_GT(completed_compactions, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { + // This test will set a maximum allowed 
space for the DB, then it will
+  // keep filling the DB until the limit is reached and bg_error_ is set.
+  // When bg_error_ is set we will verify that the DB size is greater
+  // than the limit.
+
+  std::vector<int> max_space_limits_mbs = {1, 10};
+  std::atomic<bool> bg_error_set(false);
+
+  std::atomic<int> reached_max_space_on_flush(0);
+  std::atomic<int> reached_max_space_on_compaction(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+      [&](void* arg) {
+        Status* bg_error = static_cast<Status*>(arg);
+        bg_error_set = true;
+        reached_max_space_on_flush++;
+        // clear error to ensure compaction callback is called
+        *bg_error = Status::OK();
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+        bool* enough_room = static_cast<bool*>(arg);
+        *enough_room = true;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+      [&](void* /*arg*/) {
+        bg_error_set = true;
+        reached_max_space_on_compaction++;
+      });
+
+  for (auto limit_mb : max_space_limits_mbs) {
+    bg_error_set = false;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+    auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+    Options options = CurrentOptions();
+    options.sst_file_manager = sst_file_manager;
+    options.write_buffer_size = 1024 * 512;  // 512 KB
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+    // It is easy to detect if the test is stuck in a loop. No need for
+    // complex termination logic.
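+    // Keep writing until a background flush or compaction trips the space
+    // limit: the MaxAllowedSpaceReached paths above set bg_error_, after
+    // which writes start returning a non-OK status and the loop exits.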
+    while (true) {
+      auto s = Put(rnd.RandomString(10), rnd.RandomString(50));
+      if (!s.ok()) {
+        break;
+      }
+    }
+    ASSERT_TRUE(bg_error_set);
+    uint64_t total_sst_files_size = 0;
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size));
+    ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  ASSERT_GT(reached_max_space_on_flush, 0);
+  ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+  // Open DB with infinite max open files
+  //  - First iteration use 1 thread to open files
+  //  - Second iteration use 5 threads to open files
+  for (int iter = 0; iter < 2; iter++) {
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = 100000;
+    options.disable_auto_compactions = true;
+    options.max_open_files = -1;
+    if (iter == 0) {
+      options.max_file_opening_threads = 1;
+    } else {
+      options.max_file_opening_threads = 5;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    // Create 12 files in L0 (then move them to L2)
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L2_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = 2;
+    ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+    // Create 12 files in L0
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L0_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    Close();
+
+    // Reopening the DB will load all existing files
+    Reopen(options);
+    ASSERT_EQ("12,0,12", FilesPerLevel(0));
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        ASSERT_TRUE(file.table_reader_handle != nullptr);
+      }
+    }
+
+    for (int i = 0; i < 12; i++) {
+      ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+      ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+    }
+  }
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
+  for (CacheEntryRoleOptions::Decision charge_table_reader :
+       {CacheEntryRoleOptions::Decision::kEnabled,
+        CacheEntryRoleOptions::Decision::kDisabled}) {
+    // Open DB with infinite max open files
+    //  - First iteration use 1 thread to open files
+    //  - Second iteration use 5 threads to open files
+    for (int iter = 0; iter < 2; iter++) {
+      Options options;
+      options.create_if_missing = true;
+      options.write_buffer_size = 100000;
+      options.disable_auto_compactions = true;
+      options.max_open_files = -1;
+
+      BlockBasedTableOptions table_options;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      if (iter == 0) {
+        options.max_file_opening_threads = 1;
+      } else {
+        options.max_file_opening_threads = 5;
+      }
+
+      DestroyAndReopen(options);
+
+      // Create 5 files in L0 (then move them to L2)
+      for (int i = 0; i < 5; i++) {
+        std::string k = "L2_" + Key(i);
+        ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+        ASSERT_OK(Flush()) << i;
+      }
+      CompactRangeOptions compact_options;
+      compact_options.change_level = true;
+      compact_options.target_level = 2;
+      ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+      // Create 5 files in L0
+      for (int i = 0; i < 5; i++) {
+        std::string k = "L0_" + Key(i);
+        ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+        ASSERT_OK(Flush());
+      }
+      Close();
+
+      table_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kBlockBasedTableReader,
+           {/*.charged = */ charge_table_reader}});
+      table_options.block_cache =
+          NewLRUCache(1024 /* capacity */, 0 /* num_shard_bits */,
+                      true /* strict_capacity_limit */);
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+      // Reopening the DB will try to load all existing files, conditionally
+      // subject to memory limit
+      Status s = TryReopen(options);
+
+      if (charge_table_reader == CacheEntryRoleOptions::Decision::kEnabled) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+        EXPECT_TRUE(s.ToString().find(
+                        kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                            CacheEntryRole::kBlockBasedTableReader)]) !=
+                    std::string::npos);
+        EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+                    std::string::npos);
+
+      } else {
+        EXPECT_TRUE(s.ok());
+        ASSERT_EQ("5,0,5", FilesPerLevel(0));
+      }
+    }
+  }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+  // We don't propagate oldest-key-time table property on compaction and
+  // just write 0 as default value. This affects the exact table size, since
+  // we encode table properties as varint64. Force time to be 0 to work around
+  // it. Should remove the workaround after we propagate the property on
+  // compaction.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) {
+        uint64_t* current_time = static_cast<uint64_t*>(arg);
+        *current_time = 0;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + std::to_string(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter1->status());
+
+  // Compact 5 files into 1 file in L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 1);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 1 (compacted file)
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter2->status());
+
+  // Delete all keys and compact; this will delete all live files
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  ASSERT_OK(iter1->status());
+  iter1.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 1 (compacted file)
+  ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+  ASSERT_OK(iter2->status());
+  iter2.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
+  Options options = CurrentOptions();
+  std::unique_ptr<Env> env{MockEnv::Create(Env::Default())};
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // create one blob per file
+  options.skip_checking_sst_file_sizes_on_db_open = true;
+
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + std::to_string(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    ASSERT_OK(Flush());
+  }
+  Close();
+
+  bool is_get_file_size_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
+        std::string* filename = reinterpret_cast<std::string*>(arg);
+        if (filename->find(".blob") != std::string::npos) {
+          is_get_file_size_called = true;
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_FALSE(is_get_file_size_called);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Destroy(options);
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i), "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter1->status());
+
+  // Compaction will do trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,5", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter2->status());
+
+  // Delete all keys and compact; this will delete all live files
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  ASSERT_OK(iter1->status());
+  iter1.reset();
+  ASSERT_OK(iter2->status());
+  iter2.reset();
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+}
+
+// This tests whether blob files are recorded by the SST File Manager when a
+// compaction job creates/deletes them, and in the case of atomic flush.
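+// Both table (.sst) and blob (.blob) files pass through
+// SstFileManagerImpl::OnAddFile / OnDeleteFile; the callbacks below count
+// only paths ending in ".blob" so the assertions isolate blob-file
+// accounting.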
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.disable_auto_compactions = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+  options.atomic_flush = true;
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_scheduled_to_delete = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          files_added++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          files_deleted++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+        assert(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  ASSERT_OK(Put("key_1", "value_1"));
+  ASSERT_OK(Put("key_2", "value_2"));
+  ASSERT_OK(Put("key_3", "value_3"));
+  ASSERT_OK(Put("key_4", "value_4"));
+  ASSERT_OK(Flush());
+
+  // Overwrite will create the garbage data.
+  ASSERT_OK(Put("key_3", "new_value_3"));
+  ASSERT_OK(Put("key_4", "new_value_4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(files_added, 3);
+  ASSERT_EQ(files_deleted, 0);
+  ASSERT_EQ(files_scheduled_to_delete, 0);
+  files_added = 0;
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+  // Compaction job will create a new file and delete the older files.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(files_added, 1);
+  ASSERT_EQ(files_scheduled_to_delete, 1);
+
+  sfm->WaitForEmptyTrash();
+
+  ASSERT_EQ(files_deleted, 1);
+
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  ASSERT_EQ(files_scheduled_to_delete, 4);
+
+  sfm->WaitForEmptyTrash();
+
+  ASSERT_EQ(files_deleted, 4);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..4d4655361
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include + +#include "db/db_test_util.h" +#include "monitoring/thread_status_util.h" +#include "port/stack_trace.h" +#include "rocksdb/statistics.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBStatisticsTest : public DBTestBase { + public: + DBStatisticsTest() + : DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBStatisticsTest, CompressionStatsTest) { + CompressionType type; + + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (XPRESS_Supported()) { + type = kXpressCompression; + fprintf(stderr, "using xpress\n"); + } else if (ZSTD_Supported()) { + type = kZSTD; + fprintf(stderr, "using ZSTD\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return; + } + + Options options = CurrentOptions(); + options.compression = type; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); + DestroyAndReopen(options); + + int kNumKeysWritten = 100000; + + // Check that compressions occur and are counted when compression is turned on + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); + } + ASSERT_OK(Flush()); + ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0); + + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0); + + options.compression = kNoCompression; + DestroyAndReopen(options); + uint64_t currentCompressions = + options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); + uint64_t currentDecompressions = + options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED); + + // Check that compressions do not occur when turned off + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); + } + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) - + currentCompressions, + 0); + + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) - + currentDecompressions, + 0); +} + +TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); +} + +TEST_F(DBStatisticsTest, MutexWaitStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + 
ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); +} + +TEST_F(DBStatisticsTest, ResetStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + for (int i = 0; i < 2; ++i) { + // pick arbitrary ticker and histogram. On first iteration they're zero + // because db is unused. On second iteration they're zero due to Reset(). + ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); + HistogramData histogram_data; + options.statistics->histogramData(DB_WRITE, &histogram_data); + ASSERT_EQ(0.0, histogram_data.max); + + if (i == 0) { + // The Put() makes some of the ticker/histogram stats nonzero until we + // Reset(). + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); + options.statistics->histogramData(DB_WRITE, &histogram_data); + ASSERT_GT(histogram_data.max, 0.0); + ASSERT_OK(options.statistics->Reset()); + } + } +} + +TEST_F(DBStatisticsTest, ExcludeTickers) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + options.statistics->set_stats_level(StatsLevel::kExceptTickers); + ASSERT_OK(Put("foo", "value")); + ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN)); + options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers); + Reopen(options); + ASSERT_EQ("value", Get("foo")); + ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0); +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Expected to be populated regardless of `PerfLevel` in user thread + SetPerfLevel(kDisable); + + { + // Scenario 0: only WAL data. Not verified so require ticker to be zero. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(0, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + // Create one SST. + ASSERT_OK(Flush()); + std::unordered_map table_files; + uint64_t table_files_size = 0; + GetAllDataFiles(kTableFile, &table_files, &table_files_size); + + { + // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read + // the whole file so we require the ticker stat exactly matches the file + // size. + ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_EQ(table_files_size, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + { + // Scenario 2: Table verified in `VerifyChecksum()`. This opens a + // `TableReader` to verify each block. It can involve duplicate reads of the + // same data so we set a lower-bound only. 
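+    // (Index, filter, and footer blocks may be read again as each data block
+    // is verified, so block-level verification can read more bytes than the
+    // file holds; hence the ASSERT_GE below rather than an exact match.)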
+ ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES), + table_files_size); + } +} + +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc new file mode 100644 index 000000000..981a514ad --- /dev/null +++ b/src/rocksdb/db/db_table_properties_test.cc @@ -0,0 +1,625 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include + +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/table_properties_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +// A helper function that ensures the table properties returned in +// `GetPropertiesOfAllTablesTest` is correct. +// This test assumes entries size is different for each of the tables. 
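+// If every table reports a distinct num_entries, the de-duplicating set in
+// the helper ends up the same size as the properties map, which shows each
+// table's properties were read individually rather than one table being
+// counted several times.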
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
+
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
+
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
+
+  VerifySstUniqueIds(props);
+}
+}  // anonymous namespace
+
+class DBTablePropertiesTest : public DBTestBase,
+                              public testing::WithParamInterface<std::string> {
+ public:
+  DBTablePropertiesTest()
+      : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {}
+  TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+      std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+      std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  // Part of strategy to prevent pinning table files
+  options.max_open_files = 42;
+  Reopen(options);
+
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    // Use old meta name for table properties for one file
+    if (table == 3) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
+            *reinterpret_cast<const std::string**>(meta) =
+                &kPropertiesBlockOldName;
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+    }
+    // Build file
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  std::string original_session_id;
+  ASSERT_OK(db_->GetDbSessionId(original_session_id));
+
+  // Part of strategy to prevent pinning table files
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEditHandler::LoadTables:skip_load_table_files",
+      [&](void* skip_load) { *reinterpret_cast<bool*>(skip_load) = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // 1. Read table properties directly from file
+  Reopen(options);
+  // Clear out auto-opened files
+  dbfull()->TEST_table_cache()->EraseUnRefEntries();
+  ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables into the table cache and read properties
+  Reopen(options);
+  // Clear out auto-opened files
+  dbfull()->TEST_table_cache()->EraseUnRefEntries();
+  ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+  // fetch key from 1st and 2nd table, which will internally place that table
+  // to the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables to table cache
+  Reopen(options);
+  // fetch key from all tables, which will place them in table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 4. Try to read CORRUPT properties (a) directly from file, and (b)
+  // through reader on Get
+
+  // It's not practical to prevent table file read on Open, so we
+  // corrupt after open and after purging table cache.
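+  // Flipping a single byte inside the stored session id should invalidate
+  // the properties block checksum, so property reads are expected to surface
+  // Status::Corruption; flipping the same byte back restores the file.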
+ for (bool direct : {true, false}) { + Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + std::string sst_file = props.begin()->first; + + // Corrupt the file's TableProperties using session id + std::string contents; + ASSERT_OK( + ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents)); + size_t pos = contents.find(original_session_id); + ASSERT_NE(pos, std::string::npos); + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + + // Try to read CORRUPT properties + if (direct) { + ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption()); + } else { + bool found_corruption = false; + for (int i = 0; i < 4; ++i) { + std::string result = Get(std::to_string(i * 100 + 0)); + if (result.find_first_of("Corruption: block checksum mismatch") != + std::string::npos) { + found_corruption = true; + } + } + ASSERT_TRUE(found_corruption); + } + + // UN-corrupt file for next iteration + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTablePropertiesTest, InvalidIgnored) { + // RocksDB versions 2.5 - 2.7 generate some properties that Block considers + // invalid in some way. This approximates that. + + // Inject properties block data that Block considers invalid + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", + [&](void* block_data) { + *reinterpret_cast(block_data) = Slice("X"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Corrupting the table properties corrupts the unique id. + // Ignore the unique id recorded in the manifest. 
+ auto options = CurrentOptions(); + options.verify_sst_unique_id_in_manifest = false; + Reopen(options); + + // Build file + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + + // Not crashing is good enough + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); +} + +TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr factory; + std::string id = CompactOnDeletionCollectorFactory::kClassName(); + ASSERT_OK( + TablePropertiesCollectorFactory::CreateFromString(options, id, &factory)); + auto del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(0U, del_factory->GetWindowSize()); + ASSERT_EQ(0U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, "window_size=100; deletion_trigger=90; id=" + id, &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, + "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id, + &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.5, del_factory->GetDeletionRatio()); +} + +TablePropertiesCollection +DBTablePropertiesTest::TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties, + std::size_t* num_files) { + // Since we deref zero element in the vector it can not be empty + // otherwise we pass an address to some random memory + EXPECT_GT(ranges.size(), 0U); + // run the query + TablePropertiesCollection props; + EXPECT_OK(db_->GetPropertiesOfTablesInRange( + db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props)); + + // Make sure that we've received properties for those and for those files + // only which fall within requested ranges + std::vector vmd; + db_->GetLiveFilesMetaData(&vmd); + for (auto& md : vmd) { + std::string fn = md.db_path + md.name; + bool in_range = false; + for (auto& r : ranges) { + // smallestkey < limit && largestkey >= start + if (r.limit.compare(md.smallestkey) >= 0 && + r.start.compare(md.largestkey) <= 0) { + in_range = true; + EXPECT_GT(props.count(fn), 0); + } + } + if (!in_range) { + EXPECT_EQ(props.count(fn), 0); + } + } + + if (num_properties) { + *num_properties = props.size(); + } + + if (num_files) { + *num_files = vmd.size(); + } + return props; +} + +TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { + // Fixed random sead + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 40960; + options.max_bytes_for_level_multiplier = 4; + options.hard_pending_compaction_bytes_limit = 16 * 1024; + options.num_levels = 8; + options.env = env_; + + 
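+  // The small write buffer, 2 KB target file size, and low level-0 triggers
+  // above are chosen so the 10000 random puts below fan out into many small
+  // files across several levels, giving the range queries below a varied
+  // set of files to select from.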
DestroyAndReopen(options); + + // build a decent LSM + for (int i = 0; i < 10000; i++) { + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (NumTableFilesAtLevel(0) == 0) { + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_->PauseBackgroundWork()); + + // Ensure that we have at least L0, L1 and L2 + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + // Query the largest range + std::size_t num_properties, num_files; + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_EQ(num_properties, num_files); + + // Query the empty range + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_EQ(num_properties, 0); + + // Query the middle rangee + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_GT(num_files, num_properties); + ASSERT_GT(num_properties, 0); + + // Query a bunch of random ranges + for (int j = 0; j < 100; j++) { + // create a bunch of ranges + std::vector random_keys; + // Random returns numbers with zero included + // when we pass empty ranges TestGetPropertiesOfTablesInRange() + // derefs random memory in the empty ranges[0] + // so want to be greater than zero and even since + // the below loop requires that random_keys.size() to be even. + auto n = 2 * (rnd.Uniform(50) + 1); + + for (uint32_t i = 0; i < n; ++i) { + random_keys.push_back(test::RandomKey(&rnd, 5)); + } + + ASSERT_GT(random_keys.size(), 0U); + ASSERT_EQ((random_keys.size() % 2), 0U); + + std::vector ranges; + auto it = random_keys.begin(); + while (it != random_keys.end()) { + ranges.push_back(Range(*it, *(it + 1))); + it += 2; + } + + TestGetPropertiesOfTablesInRange(std::move(ranges)); + } +} + +TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { + std::string kExtraCfName = "pikachu"; + CreateAndReopenWithCF({kExtraCfName}, CurrentOptions()); + + // Create one table per CF, then verify it was created with the column family + // name property. 
+ for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + std::string expected_cf_name; + if (cf > 0) { + expected_cf_name = kExtraCfName; + } else { + expected_cf_name = kDefaultColumnFamilyName; + } + ASSERT_EQ(expected_cf_name, + fname_to_props.begin()->second->column_family_name); + ASSERT_EQ(cf, static_cast( + fname_to_props.begin()->second->column_family_id)); + } +} + +TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { + CreateAndReopenWithCF({"goku"}, CurrentOptions()); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + std::string id, sid; + ASSERT_OK(db_->GetDbIdentity(id)); + ASSERT_OK(db_->GetDbSessionId(sid)); + ASSERT_EQ(id, fname_to_props.begin()->second->db_id); + ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id); + } +} + +class DBTableHostnamePropertyTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DBTableHostnamePropertyTest() + : DBTestBase("db_table_hostname_property_test", + /*env_do_fsync=*/false) {} +}; + +TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) { + option_config_ = std::get<0>(GetParam()); + Options opts = CurrentOptions(); + std::string expected_host_id = std::get<1>(GetParam()); + ; + if (expected_host_id == kHostnameForDbHostId) { + ASSERT_OK(env_->GetHostNameString(&expected_host_id)); + } else { + opts.db_host_id = expected_host_id; + } + CreateAndReopenWithCF({"goku"}, opts); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id); + } +} + +INSTANTIATE_TEST_CASE_P( + DBTableHostnamePropertyTest, DBTableHostnamePropertyTest, + ::testing::Values( + // OptionConfig, override db_host_location + std::make_tuple(DBTestBase::OptionConfig::kDefault, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kDefault, ""), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + ""))); + +class DeletionTriggeredCompactionTestListener : public EventListener { + public: + void OnCompactionBegin(DB*, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } + + void OnCompactionCompleted(DB*, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } +}; + +TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { + int kNumKeys = 1000; + int kWindowSize = 100; + int kNumDelsTrigger = 90; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); + + Options opts = 
CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.table_properties_collector_factories.emplace_back(compact_on_del); + + if (GetParam() == "kCompactionStyleUniversal") { + opts.compaction_style = kCompactionStyleUniversal; + } + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + DeletionTriggeredCompactionTestListener* listener = + new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Change the window size and deletion trigger and ensure new values take + // effect + kWindowSize = 50; + kNumDelsTrigger = 40; + static_cast(compact_on_del.get()) + ->SetWindowSize(kWindowSize); + static_cast(compact_on_del.get()) + ->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Change the window size to disable delete triggered compaction + kWindowSize = 0; + static_cast(compact_on_del.get()) + ->SetWindowSize(kWindowSize); + static_cast(compact_on_del.get()) + ->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED)); +} + +TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { + constexpr int kNumKeys = 1000; + constexpr int kWindowSize = 0; + constexpr int kNumDelsTrigger = 0; + constexpr double kDeletionRatio = 0.1; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger, + kDeletionRatio); + + Options opts = CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.table_properties_collector_factories.emplace_back(compact_on_del); + + Reopen(opts); + + // Add an L2 file to prevent tombstones from dropping due to obsolescence + // during flush + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + auto* listener = new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + + // Generate one L0 with kNumKeys Put. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "not important")); + } + ASSERT_OK(Flush()); + + // Generate another L0 with kNumKeys Delete. + // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1. + // The resulting L1 file has only one tombstone for user key 'Key(0)'. + // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2 + // files to L2. 
However, the resulting file is empty because the tombstone + // and value are both dropped. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(0, NumTableFilesAtLevel(i)); + } +} + +INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest, + ::testing::Values("kCompactionStyleLevel", + "kCompactionStyleUniversal")); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc new file mode 100644 index 000000000..af3194ac4 --- /dev/null +++ b/src/rocksdb/db/db_tailing_iter_test.cc @@ -0,0 +1,604 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Introduction of SyncPoint effectively disabled building and running this test +// in Release build. +// which is a pity, it is a good test +#if !defined(ROCKSDB_LITE) + +#include "db/db_test_util.h" +#include "db/forward_iterator.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBTestTailingIterator : public DBTestBase, + public ::testing::WithParamInterface { + public: + DBTestTailingIterator() + : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {} +}; + +INSTANTIATE_TEST_CASE_P(DBTestTailingIterator, DBTestTailingIterator, + ::testing::Bool()); + +TEST_P(DBTestTailingIterator, TailingIteratorSingle) { + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + // add a record and check that iter can see it + ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "mirko"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST_P(DBTestTailingIterator, TailingIteratorKeepAdding) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + std::string value(1024, 'a'); + + const int num_records = 10000; + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%016d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_P(DBTestTailingIterator, TailingIteratorSeekToNext) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + 
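+  // Two tailing iterators walk the same data differently: "iter" re-Seeks
+  // on every round while "itern" only calls Next(). Tailing iterators are
+  // expected to observe keys written after their creation, so both should
+  // land on each newly written key.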
std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); + std::string value(1024, 'a'); + + const int num_records = 1000; + for (int i = 1; i < num_records; ++i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + if (i == 1) { + itern->SeekToFirst(); + } else { + itern->Next(); + } + ASSERT_TRUE(itern->Valid()); + ASSERT_EQ(itern->key().compare(key), 0); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 2 * num_records; i > 0; --i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { + const uint64_t k150KB = 150 * 1024; + Options options; + options.write_buffer_size = k150KB; + options.max_write_buffer_number = 3; + options.min_write_buffer_number_to_merge = 2; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + int num_iters, deleted_iters; + + char bufe[32]; + snprintf(bufe, sizeof(bufe), "00b0%016d", 0); + Slice keyu(bufe, 20); + read_options.iterate_upper_bound = &keyu; + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); + std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iterh->status()); + std::string value(1024, 'a'); + bool file_iters_deleted = false; + bool file_iters_renewed_null = false; + bool file_iters_renewed_copy = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::SeekInternal:Return", [&](void* arg) { + ForwardIterator* fiter = reinterpret_cast(arg); + ASSERT_TRUE(!file_iters_deleted || + fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::Next:Return", [&](void* arg) { + ForwardIterator* fiter = reinterpret_cast(arg); + ASSERT_TRUE(!file_iters_deleted || + fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Null", + [&](void* /*arg*/) { file_iters_renewed_null = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Copy", + [&](void* /*arg*/) { file_iters_renewed_copy = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + const int num_records = 1000; + for (int i = 1; i < num_records; ++i) { + char buf1[32]; + char buf2[32]; + char buf3[32]; + char buf4[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5); + + Slice key(buf1, 20); + 
ASSERT_OK(Put(1, key, value)); + Slice keyn(buf3, 20); + ASSERT_OK(Put(1, keyn, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (i == 299) { + file_iters_deleted = true; + } + snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2); + Slice target(buf4, 20); + iterh->Seek(target); + ASSERT_TRUE(iter->Valid()); + for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) { + iterh->Next(); + ASSERT_TRUE(iterh->Valid()); + } + if (i == 299) { + file_iters_deleted = false; + } + } + + file_iters_deleted = true; + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + ASSERT_LE(num_iters, 1); + if (i == 1) { + itern->SeekToFirst(); + } else { + itern->Next(); + } + ASSERT_TRUE(itern->Valid()); + ASSERT_EQ(itern->key().compare(key), 0); + ASSERT_LE(num_iters, 1); + file_iters_deleted = false; + } + ASSERT_TRUE(file_iters_renewed_null); + ASSERT_TRUE(file_iters_renewed_copy); + iter = nullptr; + itern = nullptr; + iterh = nullptr; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + read_options.read_tier = kBlockCacheTier; + std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iteri->status()); + char buf5[32]; + snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); + Slice target1(buf5, 20); + iteri->Seek(target1); + ASSERT_TRUE(iteri->status().IsIncomplete()); + iteri = nullptr; + + read_options.read_tier = kReadAllTier; + options.table_factory.reset(NewBlockBasedTableFactory()); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + iter.reset(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + for (int i = 2 * num_records; i > 0; --i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_P(DBTestTailingIterator, TailingIteratorDeletes) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + + // write a single record, read it using the iterator, then delete it + ASSERT_OK(Put(1, "0test", "test")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0test"); + ASSERT_OK(Delete(1, "0test")); + + // write many more records + const int num_records = 10000; + std::string value(1024, 'A'); + + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "1%015d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + } + + // force a flush to make sure that no records are read from memtable + ASSERT_OK(Flush(1)); + + // skip "0test" + iter->Next(); + + // make sure we can read all new records using the existing iterator + int count = 0; + for (; iter->Valid(); iter->Next(), ++count) + ; + + ASSERT_EQ(count, num_records); +} + 
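+// For reference, the minimal tailing-read pattern these tests exercise
+// (a sketch, not part of any test; Process() is a hypothetical consumer):
+//
+//   ReadOptions ro;
+//   ro.tailing = true;  // the iterator keeps seeing newly written keys
+//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     Process(it->key(), it->value());
+//   }
+//   // After more writes, simply Seek()/SeekToFirst() again to catch up,
+//   // without creating a new iterator.
+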
+TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) { + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + ASSERT_OK(Put(1, "0101", "test")); + + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "0202", "test")); + + // Seek(0102) shouldn't find any records since 0202 has a different prefix + iter->Seek("0102"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("0202"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0202"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + read_options.read_tier = kBlockCacheTier; + + std::string key("key"); + std::string value("value"); + + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + // we either see the entry or it's not in cache + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + iter->SeekToFirst(); + // should still be true after compaction + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); +} + +TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 1000; + CreateAndReopenWithCF({"pikachu"}, options); + + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + const int NROWS = 10000; + // Write rows with keys 00000, 00002, 00004 etc. + for (int i = 0; i < NROWS; ++i) { + char buf[100]; + snprintf(buf, sizeof(buf), "%05d", 2 * i); + std::string key(buf); + std::string value("value"); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + } + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + // Seek to 00001. We expect to find 00002. + std::string start_key = "00001"; + iter->Seek(start_key); + ASSERT_TRUE(iter->Valid()); + + std::string found = iter->key().ToString(); + ASSERT_EQ("00002", found); + + // Now seek to the same key. The iterator should remain in the same + // position. + iter->Seek(found); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(found, iter->key().ToString()); +} + +// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call +// Seek() on immutable iterators when target key is >= prev_key and all +// iterators, including the memtable iterator, are over the upper bound. 
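+// In other words, once one Seek() has shown that every source (memtable and
+// immutable alike) is already past the bound, a later Seek() to an even
+// larger target can skip re-seeking the immutable sources entirely.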
+TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + + const Slice upper_bound("20", 3); + ReadOptions read_options; + read_options.tailing = true; + read_options.iterate_upper_bound = &upper_bound; + if (GetParam()) { + read_options.async_io = true; + } + ASSERT_OK(Put(1, "11", "11")); + ASSERT_OK(Put(1, "12", "12")); + ASSERT_OK(Put(1, "22", "22")); + ASSERT_OK(Flush(1)); // flush all those keys to an immutable SST file + + // Add another key to the memtable. + ASSERT_OK(Put(1, "21", "21")); + + std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); + it->Seek("12"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("12", it->key().ToString()); + + it->Next(); + // Not valid since "21" is over the upper bound. + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + // This keeps track of the number of times NeedToSeekImmutable() was true. + int immutable_seeks = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::SeekInternal:Immutable", + [&](void* /*arg*/) { ++immutable_seeks; }); + + // Seek to 13. This should not require any immutable seeks. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + it->Seek("13"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + if (GetParam()) { + ASSERT_EQ(1, immutable_seeks); + } else { + ASSERT_EQ(0, immutable_seeks); + } +} + +TEST_P(DBTestTailingIterator, TailingIteratorGap) { + // level 1: [20, 25] [35, 40] + // level 2: [10 - 15] [45 - 50] + // level 3: [20, 30, 40] + // Previously there is a bug in tailing_iterator that if there is a gap in + // lower level, the key will be skipped if it is within the range between + // the largest key of index n file and the smallest key of index n+1 file + // if both file fit in that gap. 
In this example, 25 < key < 35 + // https://github.com/facebook/rocksdb/issues/1372 + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + ASSERT_OK(Put(1, "20", "20")); + ASSERT_OK(Put(1, "30", "30")); + ASSERT_OK(Put(1, "40", "40")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(3, 1); + + ASSERT_OK(Put(1, "10", "10")); + ASSERT_OK(Put(1, "15", "15")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "45", "45")); + ASSERT_OK(Put(1, "50", "50")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + ASSERT_OK(Put(1, "20", "20")); + ASSERT_OK(Put(1, "25", "25")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "35", "35")); + ASSERT_OK(Put(1, "40", "40")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(1, 1); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(handles_[1], &meta); + + std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + it->Seek("30"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("30", it->key().ToString()); + + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("35", it->key().ToString()); + + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("40", it->key().ToString()); + + ASSERT_OK(it->status()); +} + +TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) { + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + const Slice upper_bound("cc", 3); + read_options.iterate_upper_bound = &upper_bound; + + // 1st L0 file + ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); + ASSERT_OK(Flush()); + + // 2nd L0 file + ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN")); + ASSERT_OK(Flush()); + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->Seek("aa"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); +} + +TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { + ReadOptions read_options; + read_options.tailing = true; + if (GetParam()) { + read_options.async_io = true; + } + const Slice upper_bound("cc", 3); + read_options.iterate_upper_bound = &upper_bound; + + // 1st L0 file + ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); + ASSERT_OK(Flush()); + + // 2nd L0 file + ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN")); + ASSERT_OK(Flush()); + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void)argc; + (void)argv; + return 0; +#endif +} diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc new file mode 100644 index 000000000..9575248b4 --- /dev/null +++ b/src/rocksdb/db/db_test.cc @@ -0,0 +1,7397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release build, which is a pity because it is a good test.
+#include <fcntl.h>
+
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Note that the whole DBTest family (DBTest and its child classes) disables
+// fsync on files and directories for speed.
+// If fsync needs to be covered in a test, put it somewhere else.
+class DBTest : public DBTestBase {
+ public:
+  DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {}
+};
+
+class DBTestWithParam
+    : public DBTest,
+      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  DBTestWithParam() {
+    max_subcompactions_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+  bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+  std::unique_ptr<Env> env{MockEnv::Create(Env::Default())};
+  Options options;
+  options.create_if_missing = true;
+  options.env = env.get();
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+#endif  // ROCKSDB_LITE
+
+  delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE +TEST_F(DBTest, MemEnvTest) { + std::unique_ptr env{NewMemEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = static_cast_with_check(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, WriteEmptyBatch) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch empty_batch; + ASSERT_OK(dbfull()->Write(wo, &empty_batch)); + + // make sure we can re-open it. + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ("bar", Get(1, "foo")); +} + +TEST_F(DBTest, SkipDelay) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + + for (bool sync : {true, false}) { + for (bool disableWAL : {true, false}) { + if (sync && disableWAL) { + // sync and disableWAL is incompatible. + continue; + } + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", + [&](void* /*arg*/) { sleep_count.fetch_add(1); }); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = sync; + wo.disableWAL = disableWAL; + wo.no_slowdown = true; + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
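+      // Since wo.no_slowdown is set here, the delayed write fails fast with
+      // Status::Incomplete() instead of sleeping, hence the ASSERT_NOK.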
+ ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); + ASSERT_GE(sleep_count.load(), 0); + ASSERT_GE(wait_count.load(), 0); + token.reset(); + + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); + ASSERT_GE(sleep_count.load(), 1); + token.reset(); + } + } +} + +TEST_F(DBTest, MixedSlowdownOptions) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
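+  // This write triggers the delay; once the stall begins, the sync point
+  // callback above spawns two writers that tolerate the stall (ASSERT_OK)
+  // and two no_slowdown writers that must fail immediately with
+  // Status::Incomplete().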
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(sleep_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} + +TEST_F(DBTest, MixedSlowdownOptionsInQueue) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + }); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_EQ(sleep_count.load(), 1); + ASSERT_GE(wait_count.load(), 0); +} + +TEST_F(DBTest, MixedSlowdownOptionsStop) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + std::function wakeup_writer = [&]() { + dbfull()->mutex_.Lock(); + dbfull()->bg_cv_.SignalAll(); + dbfull()->mutex_.Unlock(); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetStopToken(); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) { + wait_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + token.reset(); + threads.emplace_back(wakeup_writer); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
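+  // Under a stop token this write blocks on bg_cv_ instead of sleeping; the
+  // sync point callback resets the token and queues wakeup_writer to signal
+  // bg_cv_ so that the stalled writers can proceed.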
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(wait_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} +#ifndef ROCKSDB_LITE + +TEST_F(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2, 1) == 0) { + ASSERT_OK(Put(1, Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(s.IsInvalidArgument(), true); + ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, LevelReopenWithFIFO) { + const int kLevelCount = 4; + const int kKeyCount = 5; + const int kTotalSstFileCount = kLevelCount * kKeyCount; + const int kCF = 1; + + Options options = CurrentOptions(); + // Config level0_file_num_compaction_trigger to prevent L0 files being + // automatically compacted while we are constructing a LSM tree structure + // to test multi-level FIFO compaction. + options.level0_file_num_compaction_trigger = kKeyCount + 1; + CreateAndReopenWithCF({"pikachu"}, options); + + // The expected number of files per level after each file creation. + const std::string expected_files_per_level[kLevelCount][kKeyCount] = { + {"0,0,0,1", "0,0,0,2", "0,0,0,3", "0,0,0,4", "0,0,0,5"}, + {"0,0,1,5", "0,0,2,5", "0,0,3,5", "0,0,4,5", "0,0,5,5"}, + {"0,1,5,5", "0,2,5,5", "0,3,5,5", "0,4,5,5", "0,5,5,5"}, + {"1,5,5,5", "2,5,5,5", "3,5,5,5", "4,5,5,5", "5,5,5,5"}, + }; + + const std::string expected_entries[kKeyCount][kLevelCount + 1] = { + {"[ ]", "[ a3 ]", "[ a2, a3 ]", "[ a1, a2, a3 ]", "[ a0, a1, a2, a3 ]"}, + {"[ ]", "[ b3 ]", "[ b2, b3 ]", "[ b1, b2, b3 ]", "[ b0, b1, b2, b3 ]"}, + {"[ ]", "[ c3 ]", "[ c2, c3 ]", "[ c1, c2, c3 ]", "[ c0, c1, c2, c3 ]"}, + {"[ ]", "[ d3 ]", "[ d2, d3 ]", "[ d1, d2, d3 ]", "[ d0, d1, d2, d3 ]"}, + {"[ ]", "[ e3 ]", "[ e2, e3 ]", "[ e1, e2, e3 ]", "[ e0, e1, e2, e3 ]"}, + }; + + // The loop below creates the following LSM tree where each (k, v) pair + // represents a file that contains that entry. When a file is created, + // the db is reopend with FIFO compaction and verified the LSM tree + // structure is still the same. + // + // The resulting LSM tree will contain 5 different keys. Each key as + // 4 different versions, located in different level. + // + // L0: (e, e0) (d, d0) (c, c0) (b, b0) (a, a0) + // L1: (a, a1) (b, b1) (c, c1) (d, d1) (e, e1) + // L2: (a, a2) (b, b2) (c, c2) (d, d2) (e, e2) + // L3: (a, a3) (b, b3) (c, c3) (d, d3) (e, e3) + for (int l = 0; l < kLevelCount; ++l) { + int level = kLevelCount - 1 - l; + for (int p = 0; p < kKeyCount; ++p) { + std::string put_key = std::string(1, char('a' + p)); + ASSERT_OK(Put(kCF, put_key, put_key + std::to_string(level))); + ASSERT_OK(Flush(kCF)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + for (int g = 0; g < kKeyCount; ++g) { + int entry_count = (p >= g) ? 
l + 1 : l; + std::string get_key = std::string(1, char('a' + g)); + CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], get_key, + kCF, {"pikachu"}, options); + } + if (level != 0) { + MoveFilesToLevel(level, kCF); + for (int g = 0; g < kKeyCount; ++g) { + int entry_count = (p >= g) ? l + 1 : l; + std::string get_key = std::string(1, char('a' + g)); + CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], + get_key, kCF, {"pikachu"}, options); + } + } + ASSERT_EQ(expected_files_per_level[l][p], FilesPerLevel(kCF)); + } + } + + // The expected number of sst files in each level after each FIFO compaction + // that deletes the oldest sst file. + const std::string expected_files_per_level_after_fifo[] = { + "5,5,5,4", "5,5,5,3", "5,5,5,2", "5,5,5,1", "5,5,5", "5,5,4", "5,5,3", + "5,5,2", "5,5,1", "5,5", "5,4", "5,3", "5,2", "5,1", + "5", "4", "3", "2", "1", "", + }; + + // The expected value entries of each key after each FIFO compaction. + // This verifies whether FIFO removes the file with the smallest key in non-L0 + // files first then the oldest files in L0. + const std::string expected_entries_after_fifo[kKeyCount][kLevelCount + 1] = { + {"[ a0, a1, a2, a3 ]", "[ a0, a1, a2 ]", "[ a0, a1 ]", "[ a0 ]", "[ ]"}, + {"[ b0, b1, b2, b3 ]", "[ b0, b1, b2 ]", "[ b0, b1 ]", "[ b0 ]", "[ ]"}, + {"[ c0, c1, c2, c3 ]", "[ c0, c1, c2 ]", "[ c0, c1 ]", "[ c0 ]", "[ ]"}, + {"[ d0, d1, d2, d3 ]", "[ d0, d1, d2 ]", "[ d0, d1 ]", "[ d0 ]", "[ ]"}, + {"[ e0, e1, e2, e3 ]", "[ e0, e1, e2 ]", "[ e0, e1 ]", "[ e0 ]", "[ ]"}, + }; + + // In the 2nd phase, we reopen the DB with FIFO compaction. In each reopen, + // we config max_table_files_size so that FIFO will remove exactly one file + // at a time upon compaction, and we will use it to verify whether the sst + // files are deleted in the correct order. + for (int i = 0; i < kTotalSstFileCount; ++i) { + uint64_t total_sst_files_size = 0; + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.total-sst-files-size", &total_sst_files_size)); + ASSERT_TRUE(total_sst_files_size > 0); + + Options fifo_options(options); + fifo_options.compaction_style = kCompactionStyleFIFO; + options.create_if_missing = false; + fifo_options.max_open_files = -1; + fifo_options.disable_auto_compactions = false; + // Config max_table_files_size to be total_sst_files_size - 1 so that + // FIFO will delete one file. + fifo_options.compaction_options_fifo.max_table_files_size = + total_sst_files_size - 1; + ASSERT_OK( + TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options)); + // For FIFO to pick a compaction + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact(false)); + for (int g = 0; g < kKeyCount; ++g) { + std::string get_key = std::string(1, char('a' + g)); + int status_index = i / kKeyCount; + if ((i % kKeyCount) >= g) { + // If true, then it means the sst file containing the get_key in the + // current level has already been deleted, so we need to move the + // status_index for checking the expected value. 
+        status_index++;
+      }
+      CheckAllEntriesWithFifoReopen(
+          expected_entries_after_fifo[g][status_index], get_key, kCF,
+          {"pikachu"}, options);
+    }
+    ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF));
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+  do {
+    Random rnd(301);
+    Options options = CurrentOptions();
+    for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+      CreateAndReopenWithCF({"pikachu"}, options);
+      WriteOptions wopt;
+      wopt.disableWAL = (disableWAL == 1);
+      // 1st round: put but do not flush
+      ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+      ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+      ASSERT_EQ("first", Get(1, "foo"));
+      ASSERT_EQ("one", Get(1, "bar"));
+
+      // Read directly from persisted data.
+      ReadOptions ropt;
+      ropt.read_tier = kPersistedTier;
+      std::string value;
+      if (wopt.disableWAL) {
+        // as the data has not yet been flushed, we expect not found.
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+      } else {
+        ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+        ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+      }
+
+      // MultiGet
+      std::vector<ColumnFamilyHandle*> multiget_cfs;
+      multiget_cfs.push_back(handles_[1]);
+      multiget_cfs.push_back(handles_[1]);
+      std::vector<Slice> multiget_keys;
+      multiget_keys.push_back("foo");
+      multiget_keys.push_back("bar");
+      std::vector<std::string> multiget_values;
+      auto statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[0].IsNotFound());
+        ASSERT_TRUE(statuses[1].IsNotFound());
+      } else {
+        ASSERT_OK(statuses[0]);
+        ASSERT_OK(statuses[1]);
+      }
+
+      // 2nd round: flush and put a new value in the memtable.
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+      // once the data has been flushed, we are able to get the
+      // data when kPersistedTier is used.
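+      // (With the WAL enabled, memtable entries count as persisted because
+      // they would survive a crash; with disableWAL they are skipped.)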
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+      ASSERT_EQ(value, "first");
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+      ASSERT_EQ(value, "one");
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(
+            db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+      } else {
+        ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+        ASSERT_EQ(value, "hello");
+      }
+
+      // Expect the same result from MultiGet
+      multiget_cfs.push_back(handles_[1]);
+      multiget_keys.push_back("rocksdb");
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].ok());
+      ASSERT_EQ("first", multiget_values[0]);
+      ASSERT_TRUE(statuses[1].ok());
+      ASSERT_EQ("one", multiget_values[1]);
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[2].IsNotFound());
+      } else {
+        ASSERT_OK(statuses[2]);
+      }
+
+      // 3rd round: delete and flush
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+      if (wopt.disableWAL) {
+        // Still expect to find the value, as its delete has not yet been
+        // flushed.
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+        ASSERT_EQ(value, "one");
+      } else {
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+      }
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+      ASSERT_EQ(value, "hello");
+
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].IsNotFound());
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[1].ok());
+        ASSERT_EQ("one", multiget_values[1]);
+      } else {
+        ASSERT_TRUE(statuses[1].IsNotFound());
+      }
+      ASSERT_TRUE(statuses[2].ok());
+      ASSERT_EQ("hello", multiget_values[2]);
+      if (wopt.disableWAL == 0) {
+        DestroyAndReopen(options);
+      }
+    }
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, SingleDeleteFlush) {
+  // Test to check whether flushing preserves a single delete hidden
+  // behind a put.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Put values on the second level (so that they will not be in the same
+    // compaction as the other operations).
+    ASSERT_OK(Put(1, "foo", "first"));
+    ASSERT_OK(Put(1, "bar", "one"));
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(2, 1);
+
+    // (Single) delete hidden by a put
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "second"));
+    ASSERT_OK(Delete(1, "bar"));
+    ASSERT_OK(Put(1, "bar", "two"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_OK(Delete(1, "bar"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+
+    ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the
+    // test case. Skip MergePut because single delete does not get removed
+    // when it encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+  // Single deletes that encounter the matching put in a flush should get
+  // removed.
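+  // For example (a sketch of the property, not additional test code):
+  //
+  //   ASSERT_OK(Put(1, "a", "v"));
+  //   ASSERT_OK(SingleDelete(1, "a"));
+  //   ASSERT_OK(Flush(1));
+  //   ASSERT_EQ("[ ]", AllEntriesFor("a", 1));  // both entries cancel out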
+ do { + Random rnd(301); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", Slice())); + ASSERT_OK(Put(1, "a", Slice())); + ASSERT_OK(SingleDelete(1, "a")); + ASSERT_OK(Flush(1)); + + ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); + // Skip FIFO and universal compaction because they do not apply to the test + // case. Skip MergePut because single delete does not get removed when it + // encounters a merge. + } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); +} + +// Disable because not all platform can run it. +// It requires more than 9GB memory to run it, With single allocation +// of more than 3GB. +TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) { + const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024}; // 4GB value + std::string raw(kValueSize, 'v'); + Options options = CurrentOptions(); + options.env = env_; + options.merge_operator = MergeOperators::CreatePutOperator(); + options.write_buffer_size = 100000; // Small write buffer + options.paranoid_checks = true; + DestroyAndReopen(options); + + ASSERT_OK(Put("boo", "v1")); + ASSERT_TRUE(Put("foo", raw).IsInvalidArgument()); + ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument()); + + WriteBatch wb; + ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument()); + ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument()); + + Slice value_slice = raw; + Slice key_slice = "foo"; + SliceParts sp_key(&key_slice, 1); + SliceParts sp_value(&value_slice, 1); + + ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument()); + ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument()); +} + +// Disable because not all platform can run it. +// It requires more than 9GB memory to run it, With single allocation +// of more than 3GB. +TEST_F(DBTest, DISABLED_VeryLargeValue) { + const size_t kValueSize = 3221225472u; // 3GB value + const size_t kKeySize = 8388608u; // 8MB key + std::string raw(kValueSize, 'v'); + std::string key1(kKeySize, 'c'); + std::string key2(kKeySize, 'd'); + + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.paranoid_checks = true; + DestroyAndReopen(options); + + ASSERT_OK(Put("boo", "v1")); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put(key1, raw)); + raw[0] = 'w'; + ASSERT_OK(Put(key2, raw)); + dbfull()->TEST_WaitForFlushMemTable(); + +#ifndef ROCKSDB_LITE + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +#endif // !ROCKSDB_LITE + + std::string value; + Status s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); + + // Compact all files. + Flush(); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Check DB is not in read-only state. 
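+  // (A failed background flush or compaction of these huge values would put
+  // the DB into read-only error state, so a successful Put() shows it is
+  // still healthy.)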
+ ASSERT_OK(Put("boo", "v1")); + + s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); +} + +TEST_F(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); + } while (ChangeOptions()); +} + +TEST_F(DBTest, GetLevel0Ordering) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put(1, "bar", "b")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, WrongLevel0Config) { + Options options = CurrentOptions(); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + options.level0_stop_writes_trigger = 1; + options.level0_slowdown_writes_trigger = 2; + options.level0_file_num_compaction_trigger = 3; + ASSERT_OK(DB::Open(options, dbname_, &db_)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, GetOrderedByLevels) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + Compact(1, "a", "z"); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, GetPicksCorrectFile) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put(1, "a", "va")); + Compact(1, "a", "b"); + ASSERT_OK(Put(1, "x", "vx")); + Compact(1, "x", "y"); + ASSERT_OK(Put(1, "f", "vf")); + Compact(1, "f", "g"); + ASSERT_EQ("va", Get(1, "a")); + ASSERT_EQ("vf", Get(1, "f")); + ASSERT_EQ("vx", Get(1, "x")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, GetEncountersEmptyLevel) { + do { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occurring at level 1 (instead of the correct level 0). 
+ + // Step 1: First place sstables in levels 0 and 2 + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); + + // Step 2: clear level 1 if necessary. + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1); + + // Step 3: read a bunch of times + for (int i = 0; i < 1000; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, "missing")); + } + + // Step 4: Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_size_to_maintain = -1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); + } while (ChangeCompactOptions()); +} +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); + options.max_write_buffer_number = 2; + options.write_buffer_size = 120 * 1024; + auto flush_listener = std::make_shared(); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull; + options.listeners.push_back(flush_listener); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + + std::atomic thread_num(0); + // each column family will have 5 thread, each thread generating 2 memtables. 
+ // each column family should end up with 10 table files + std::function fill_memtable_func = [&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), "")); + } + }; + + for (int i = 0; i < 10; ++i) { + threads.emplace_back(fill_memtable_func); + } + + for (auto& t : threads) { + t.join(); + } + + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); +} +#endif // ROCKSDB_LITE + +namespace { +class KeepFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return "KeepFilter"; } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr(new KeepFilter()); + } + + const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; +}; + +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + db_test->env_->MockSleepForMicroseconds(1000); + return true; + } + + const char* Name() const override { return "DelayFilter"; } + + private: + DBTestBase* db_test; +}; + +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new DelayFilter(db_test)); + } + + const char* Name() const override { return "DelayFilterFactory"; } + + private: + DBTestBase* db_test; +}; +} // anonymous namespace + +#ifndef ROCKSDB_LITE + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, FailMoreDbPaths) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 10000000); + options.db_paths.emplace_back(dbname_ + "_2", 1000000); + options.db_paths.emplace_back(dbname_ + "_3", 1000000); + options.db_paths.emplace_back(dbname_ + "_4", 1000000); + options.db_paths.emplace_back(dbname_ + "_5", 1000000); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + +void CheckColumnFamilyMeta( + const ColumnFamilyMetaData& cf_meta, const std::string& cf_name, + const std::vector>& files_by_level, + uint64_t start_time, uint64_t end_time) { + ASSERT_EQ(cf_meta.name, cf_name); + ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); + + 
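+  // Walk every level, checking each file's metadata as reported through
+  // ColumnFamilyMetaData against the expected FileMetaData passed in, and
+  // accumulate per-level and total sizes for the final file_count/size
+  // checks.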
uint64_t cf_size = 0; + size_t file_count = 0; + + for (size_t i = 0; i < cf_meta.levels.size(); ++i) { + const auto& level_meta_from_cf = cf_meta.levels[i]; + const auto& level_meta_from_files = files_by_level[i]; + + ASSERT_EQ(level_meta_from_cf.level, i); + ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size()); + + file_count += level_meta_from_cf.files.size(); + + uint64_t level_size = 0; + for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) { + const auto& file_meta_from_cf = level_meta_from_cf.files[j]; + const auto& file_meta_from_files = level_meta_from_files[j]; + + level_size += file_meta_from_cf.size; + + ASSERT_EQ(file_meta_from_cf.file_number, + file_meta_from_files.fd.GetNumber()); + ASSERT_EQ(file_meta_from_cf.file_number, + TableFileNameToNumber(file_meta_from_cf.name)); + ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size); + ASSERT_EQ(file_meta_from_cf.smallest_seqno, + file_meta_from_files.fd.smallest_seqno); + ASSERT_EQ(file_meta_from_cf.largest_seqno, + file_meta_from_files.fd.largest_seqno); + ASSERT_EQ(file_meta_from_cf.smallestkey, + file_meta_from_files.smallest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.largestkey, + file_meta_from_files.largest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number, + file_meta_from_files.oldest_blob_file_number); + ASSERT_EQ(file_meta_from_cf.oldest_ancester_time, + file_meta_from_files.oldest_ancester_time); + ASSERT_EQ(file_meta_from_cf.file_creation_time, + file_meta_from_files.file_creation_time); + ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); + ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); + ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); + // More from FileStorageInfo + ASSERT_EQ(file_meta_from_cf.file_type, kTableFile); + ASSERT_EQ(file_meta_from_cf.name, + "/" + file_meta_from_cf.relative_filename); + ASSERT_EQ(file_meta_from_cf.directory, file_meta_from_cf.db_path); + } + + ASSERT_EQ(level_meta_from_cf.size, level_size); + cf_size += level_size; + } + + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); +} + +void CheckLiveFilesMeta( + const std::vector& live_file_meta, + const std::vector>& files_by_level) { + size_t total_file_count = 0; + for (const auto& f : files_by_level) { + total_file_count += f.size(); + } + + ASSERT_EQ(live_file_meta.size(), total_file_count); + + int level = 0; + int i = 0; + + for (const auto& meta : live_file_meta) { + if (level != meta.level) { + level = meta.level; + i = 0; + } + + ASSERT_LT(i, files_by_level[level].size()); + + const auto& expected_meta = files_by_level[level][i]; + + ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName); + ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber()); + ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name)); + ASSERT_EQ(meta.size, expected_meta.fd.file_size); + ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno); + ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno); + ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString()); + ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); + ASSERT_EQ(meta.oldest_blob_file_number, + expected_meta.oldest_blob_file_number); + + // More from FileStorageInfo + ASSERT_EQ(meta.file_type, kTableFile); + ASSERT_EQ(meta.name, "/" + meta.relative_filename); + ASSERT_EQ(meta.directory, meta.db_path); + + ++i; + } +} + +#ifndef 
ROCKSDB_LITE +void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, + const std::string& checksum_value, + uint64_t garbage_blob_count = 0, + uint64_t garbage_blob_bytes = 0) { + ColumnFamilyData* cfd = + (static_cast(cfh))->cfd(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + // Add a live blob file. + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + auto meta = BlobFileMetaData::Create(std::move(shared_meta), + BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); +} + +static void CheckBlobMetaData( + const BlobMetaData& bmd, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, const std::string& checksum_value, + uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) { + ASSERT_EQ(bmd.blob_file_number, blob_file_number); + ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number)); + ASSERT_EQ(bmd.blob_file_size, + total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize); + + ASSERT_EQ(bmd.total_blob_count, total_blob_count); + ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes); + ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count); + ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes); + ASSERT_EQ(bmd.checksum_method, checksum_method); + ASSERT_EQ(bmd.checksum_value, checksum_value); +} + +TEST_F(DBTest, MetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time); + uint64_t start_time = static_cast(temp_time); + + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + for (int i = 0; i < 100; ++i) { + // Add a single blob reference to each file + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000, + /* offset */ 1234, /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index), + blob_index)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + + ++key_index; + + // Fill up the rest of the file with random values. 
+ GenerateNewFile(&rnd, &key_index, /* nowait */ true); + + ASSERT_OK(Flush()); + } + + std::vector> files_by_level; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + + options.env->GetCurrentTime(&temp_time); + uint64_t end_time = static_cast(temp_time); + + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level, + start_time, end_time); + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + CheckLiveFilesMeta(live_file_meta, files_by_level); +} + +TEST_F(DBTest, AllMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t start_time = static_cast(temp_time); + + Random rnd(301); + dbfull()->TEST_LockMutex(); + for (int cf = 0; cf < 2; cf++) { + AddBlobFile(handles_[cf], blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } + dbfull()->TEST_UnlockMutex(); + + std::vector all_meta; + db_->GetAllColumnFamilyMetaData(&all_meta); + + std::vector> default_files_by_level; + std::vector> pikachu_files_by_level; + dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level); + dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level); + + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t end_time = static_cast(temp_time); + + ASSERT_EQ(all_meta.size(), 2); + for (int cf = 0; cf < 2; cf++) { + const auto& cfmd = all_meta[cf]; + if (cf == 0) { + CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, start_time, + end_time); + } else { + CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time, + end_time); + } + ASSERT_EQ(cfmd.blob_files.size(), 1U); + const auto& bmd = cfmd.blob_files[0]; + ASSERT_EQ(cfmd.blob_file_count, 1U); + ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); + ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_)); + CheckBlobMetaData(bmd, blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } +} + +namespace { +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(rnd.RandomString(10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); + } + ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } + + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(rnd.RandomString(10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); + } + ASSERT_OK(self->dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); +} + +// returns false if the calling-Test should be skipped 
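+// Picks the first compression type supported by this build and configures
+// options.compression_per_level so that L0 stays uncompressed while every
+// other level uses the chosen type.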
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, + "Test with compression options : window_bits = %d, level = %d, " + "strategy = %d}\n", + wbits, lev, strategy); + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; + + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (XPRESS_Supported()) { + type = kXpressCompression; + fprintf(stderr, "using xpress\n"); + } else if (ZSTD_Supported()) { + type = kZSTD; + fprintf(stderr, "using ZSTD\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); + + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; +} +} // anonymous namespace + +TEST_F(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); +} + +TEST_F(DBTest, MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); +} + +// This test may fail because of a legit case that multiple L0 files +// are trivial moved to L1. +TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + CreateAndReopenWithCF({"pikachu"}, options); + + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. 
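+    // (A rough worked example, assuming the library defaults of
+    // num_levels = 7 and level0_stop_writes_trigger = 36: kMaxFiles below
+    // would come out to 7 + 36 = 43. The actual values depend on the
+    // options variant selected by ChangeCompactOptions().)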
+ const int kMaxFiles = + options.num_levels + options.level0_stop_writes_trigger; + + Random rnd(301); + std::string value = + rnd.RandomString(static_cast(2 * options.write_buffer_size)); + for (int i = 0; i < 5 * kMaxFiles; i++) { + ASSERT_OK(Put(1, "key", value)); + ASSERT_LE(TotalTableFiles(1), kMaxFiles); + } + } while (ChangeCompactOptions()); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST_F(DBTest, ApproximateSizesMemTable) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); + } + + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtables = true; + size_approx_options.include_files = true; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + // Zero if not including mem table + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); + } + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(100); + end = Key(1020); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + + options.max_write_buffer_number = 8; + options.min_write_buffer_number_to_merge = 5; + options.write_buffer_size = 1024 * N; // Not very large + DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); + + int keys[N * 3]; + for (int i = 0; i < N; i++) { + keys[i * 3] = i * 5; + keys[i * 3 + 1] = i * 5 + 1; + keys[i * 3 + 2] = i * 5 + 2; + } + // MemTable entry counting is estimated and can vary greatly depending on + // layout. Thus, using deterministic seed for test stability. 
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); + + for (int i = 0; i < N * 3; i++) { + ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024))); + } + + start = Key(100); + end = Key(300); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + + start = Key(2100); + end = Key(2300); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + uint64_t size_with_mt, size_without_mt; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_GT(size_with_mt, 6000); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); + ASSERT_EQ(size_without_mt, 0); + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024))); + } + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); + ASSERT_GT(size_with_mt, size_without_mt); + ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtables flag works as expected + size_approx_options.include_memtables = false; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2)); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + // Roughly 4 keys per data block, 1000 keys per file, + // with filter substantially larger than a data block + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(16)); + table_options.block_size = 100; + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.write_buffer_size = 24 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 24 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); + } + // Flush everything to files + ASSERT_OK(Flush()); + // Compact the entire key space into the next level + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr)); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); + } + // Flush everything to files again + ASSERT_OK(Flush()); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + { + const std::string start = Key(0); + const std::string 
end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtables = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size)); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size2)); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); + } + + { + // Ensure that metadata is not falsely attributed only to the last data in + // the file. (In some applications, filters can be large portion of data + // size.) + // Perform many queries over small range, enough to ensure crossing file + // boundary, and make sure we never see a spike for large filter. + for (int i = 0; i < 3000; i += 10) { + const std::string start = Key(i); + const std::string end = Key(i + 11); // overlap by 1 key + const Range r(start, end); + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_LE(size, 11 * 100); + } + } +} + +TEST_F(DBTest, GetApproximateMemTableStats) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); + } + + uint64_t count; + uint64_t size; + + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_GT(count, 0); + ASSERT_LE(count, N); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_EQ(count, 0); + ASSERT_EQ(size, 0); + + ASSERT_OK(Flush()); + + start = Key(50); + end = Key(60); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_EQ(count, 0); + ASSERT_EQ(size, 0); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); + } + + start = Key(100); + end = Key(1020); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_GT(count, 20); + ASSERT_GT(size, 6000); +} + +TEST_F(DBTest, ApproximateSizes) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + uint64_t size; + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1))); + } + + // 0 because GetApproximateSizes() does not account for memtable 
space + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_OK(Size("", Key(i), 1, &size)); + ASSERT_TRUE(Between(size, S1 * i, S2 * i)); + ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1))); + ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 10, S2 * 10)); + } + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1])); + } + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashIndex)); +} + +TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + std::string big1 = rnd.RandomString(100000); + ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(2), big1)); + ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(4), big1)); + ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000))); + ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000))); + + // Check sizes across recovery by reopening a few times + uint64_t size; + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + ASSERT_OK(Size("", Key(0), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ASSERT_OK(Size("", Key(1), 1, &size)); + ASSERT_TRUE(Between(size, 10000, 11000)); + ASSERT_OK(Size("", Key(2), 1, &size)); + ASSERT_TRUE(Between(size, 20000, 21000)); + ASSERT_OK(Size("", Key(3), 1, &size)); + ASSERT_TRUE(Between(size, 120000, 121000)); + ASSERT_OK(Size("", Key(4), 1, &size)); + ASSERT_TRUE(Between(size, 130000, 131000)); + ASSERT_OK(Size("", Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 230000, 232000)); + ASSERT_OK(Size("", Key(6), 1, &size)); + ASSERT_TRUE(Between(size, 240000, 242000)); + // Ensure some overhead is accounted for, even without including all + ASSERT_OK(Size("", Key(7), 1, &size)); + ASSERT_TRUE(Between(size, 540500, 545000)); + ASSERT_OK(Size("", Key(8), 1, &size)); + ASSERT_TRUE(Between(size, 550500, 555000)); + + ASSERT_OK(Size(Key(3), Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 110100, 111000)); + + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, Snapshot) { + env_->SetMockSleep(); + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + ASSERT_OK(Put(0, "foo", "0v1")); + ASSERT_OK(Put(1, "foo", "1v1")); + + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + ASSERT_EQ(GetTimeOldestSnapshots(), + static_cast(s1->GetUnixTime())); + ASSERT_OK(Put(0, "foo", "0v2")); + ASSERT_OK(Put(1, "foo", "1v2")); + + env_->MockSleepForSeconds(1); + + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + ASSERT_EQ(GetTimeOldestSnapshots(), + static_cast(s1->GetUnixTime())); + ASSERT_OK(Put(0, "foo", "0v3")); + ASSERT_OK(Put(1, "foo", "1v3")); + + { + ManagedSnapshot s3(db_); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + ASSERT_EQ(GetTimeOldestSnapshots(), + static_cast(s1->GetUnixTime())); + + ASSERT_OK(Put(0, "foo", "0v4")); + ASSERT_OK(Put(1, "foo", "1v4")); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); + ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } + + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + ASSERT_EQ(GetTimeOldestSnapshots(), + static_cast(s1->GetUnixTime())); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); + ASSERT_EQ(GetTimeOldestSnapshots(), + static_cast(s2->GetUnixTime())); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), 0); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + uint64_t size; + do { + Options options = CurrentOptions(options_override); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); + + std::string big = rnd.RandomString(50000); + ASSERT_OK(Put(1, "foo", big)); + ASSERT_OK(Put(1, "pastfoo", "v")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(1, "foo", "tiny")); + ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more + + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); + + ASSERT_EQ(big, Get(1, 
"foo", snapshot)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); + Slice x("x"); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1])); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1])); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 0, 1000)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, UnremovableSingleDelete) { + // If we compact: + // + // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) + // + // We do not want to end up with: + // + // Put(A, v1) Snapshot Put(A, v2) + // + // Because a subsequent SingleDelete(A) would delete the Put(A, v2) + // but not Put(A, v1), so Get(A) would return v1. + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "first")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Put(1, "foo", "second")); + ASSERT_OK(Flush(1)); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("second", Get(1, "foo")); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); + ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); + + ASSERT_OK(SingleDelete(1, "foo")); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + db_->ReleaseSnapshot(snapshot); + // Skip FIFO and universal compaction because they do not apply to the test + // case. Skip MergePut because single delete does not get removed when it + // encounters a merge. 
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DeletionMarkers1) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + + // Place a table at level last-1 to prevent merging with preceding mutation + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); + + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + Slice z("z"); + ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1])); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1])); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); +} + +TEST_F(DBTest, DeletionMarkers2) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + + // Place a table at level last-1 to prevent merging with preceding mutation + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); + + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1])); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1])); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); +} + +TEST_F(DBTest, OverlapInLevel0) { + do { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > + // 0. + ASSERT_OK(Put(1, "100", "v100")); + ASSERT_OK(Put(1, "999", "v999")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Delete(1, "100")); + ASSERT_OK(Delete(1, "999")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(1, 1); + ASSERT_EQ("0,1,1", FilesPerLevel(1)); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
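+    // (So after the two flushes below, files[0] is the later-flushed
+    // (200..900) file and files[1] the (300..500) file, since 200 < 300.)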
+ ASSERT_OK(Put(1, "300", "v300")); + ASSERT_OK(Put(1, "500", "v500")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "200", "v200")); + ASSERT_OK(Put(1, "600", "v600")); + ASSERT_OK(Put(1, "900", "v900")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("2,1,1", FilesPerLevel(1)); + + // BEGIN addition to existing test + // Take this opportunity to verify SST unique ids (including Plain table) + TablePropertiesCollection tbc; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc)); + VerifySstUniqueIds(tbc); + // END addition to existing test + + // Compact away the placeholder files we created initially + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1])); + ASSERT_EQ("2", FilesPerLevel(1)); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete(1, "600")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ("NOT_FOUND", Get(1, "600")); + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + const char* Name() const override { return "rocksdb.NewComparator"; } + int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(a, b); + } + void FindShortestSeparator(std::string* s, const Slice& l) const override { + BytewiseComparator()->FindShortestSeparator(s, l); + } + void FindShortSuccessor(std::string* key) const override { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options new_options, options; + NewComparator cmp; + do { + options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + new_options = CurrentOptions(); + new_options.comparator = &cmp; + // only the non-default column family has non-matching comparator + Status s = TryReopenWithColumnFamilies( + {"default", "pikachu"}, std::vector({options, new_options})); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + const char* Name() const override { return "test.NumberComparator"; } + int Compare(const Slice& a, const Slice& b) const override { + return ToNumber(a) - ToNumber(b); + } + void FindShortestSeparator(std::string* s, const Slice& l) const override { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + void FindShortSuccessor(std::string* key) const override { + ToNumber(*key); // Check format + } + + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') + << EscapeString(x); + int val; + char ignored; + EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.write_buffer_size = 4096; // Compact more often + new_options.arena_block_size = 4096; + new_options = CurrentOptions(new_options); + DestroyAndReopen(new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); + ASSERT_OK(Put(1, "[10]", "ten")); + ASSERT_OK(Put(1, "[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get(1, "[10]")); + ASSERT_EQ("ten", Get(1, "[0xa]")); + ASSERT_EQ("twenty", Get(1, "[20]")); + ASSERT_EQ("twenty", Get(1, "[0x14]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); + Compact(1, "[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i * 10); + ASSERT_OK(Put(1, buf, buf)); + } + Compact(1, "[0]", "[1000000]"); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, DBOpen_Options) { + Options options = CurrentOptions(); + std::string dbname = test::PerThreadDBPath("db_options_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + options.create_if_missing = true; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST_F(DBTest, DBOpen_Change_NumLevels) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_TRUE(db_ != nullptr); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "a", "123")); + ASSERT_OK(Put(1, "b", "234")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(3, 1); + Close(); + + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db_ == nullptr); +} + +TEST_F(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::PerThreadDBPath("db_meta"); + ASSERT_OK(env_->CreateDirIfMissing(dbname)); + std::string metadbname = MetaDatabaseName(dbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metadbname)); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); + + // Destroy previous versions if they exist. Using the long way. 
+  Options options = CurrentOptions();
+  ASSERT_OK(DestroyDB(metametadbname, options));
+  ASSERT_OK(DestroyDB(metadbname, options));
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  // Setup databases
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(options, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metametadbname, &db));
+  delete db;
+  db = nullptr;
+
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  // Check if deletion worked.
+  options.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SnapshotFiles) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;  // Large write buffer
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(rnd.RandomString(100000));
+      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    }
+
+    // assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+    // get a file snapshot
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    ASSERT_OK(dbfull()->DisableFileDeletions());
+    ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+    // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+    ASSERT_EQ(files.size(), 5U);
+
+    uint64_t number = 0;
+    FileType type;
+
+    // copy these files to a new snapshot directory
+    std::string snapdir = dbname_ + ".snapdir/";
+    if (env_->FileExists(snapdir).ok()) {
+      ASSERT_OK(DestroyDir(env_, snapdir));
+    }
+    ASSERT_OK(env_->CreateDir(snapdir));
+
+    for (size_t i = 0; i < files.size(); i++) {
+      // our clients require that GetLiveFiles returns
+      // files with "/" as first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
+
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
+
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          ASSERT_EQ(manifest_number, 0);
+          manifest_number = number;
+          ASSERT_GE(size, manifest_size);
+          size = manifest_size;  // copy only valid MANIFEST data
+        }
+      }
+      CopyFile(src, dest, size);
+    }
+
+    // release file snapshot
+    ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false));
+    // overwrite one key, this key should not appear in the snapshot
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(rnd.RandomString(100000));
+      ASSERT_OK(Put(0, Key(i), extras[i]));
+    }
+
+    // verify that data in the snapshot are correct
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back("default", ColumnFamilyOptions());
+    column_families.emplace_back("pikachu", ColumnFamilyOptions());
+    std::vector<ColumnFamilyHandle*> cf_handles;
+    DB* snapdb;
+    DBOptions opts;
+    opts.env = env_;
+    opts.create_if_missing = false;
+    Status stat =
+        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+    ASSERT_OK(stat);
+
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val));
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    delete snapdb;
+
+    // look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    ASSERT_OK(dbfull()->DisableFileDeletions());
+    ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size));
+
+    // find the new manifest file. assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
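+    // (Within a single DB session the MANIFEST only grows: flushes and
+    // compactions append VersionEdit records to it, so the descriptor file
+    // observed after the extra Put must be at least as large, and here
+    // strictly larger.)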
+ for (size_t i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + ASSERT_EQ(new_manifest_number, 0); + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); + + // Also test GetLiveFilesStorageInfo + std::vector new_infos; + ASSERT_OK(db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), + &new_infos)); + + // Close DB (while deletions disabled) + Close(); + + // Validate + for (auto& info : new_infos) { + std::string path = info.directory + "/" + info.relative_filename; + uint64_t size; + ASSERT_OK(env_->GetFileSize(path, &size)); + if (info.trim_to_size) { + ASSERT_LE(info.size, size); + } else if (!info.replacement_contents.empty()) { + ASSERT_EQ(info.size, info.replacement_contents.size()); + } else { + ASSERT_EQ(info.size, size); + } + if (info.file_type == kDescriptorFile) { + ASSERT_EQ(info.file_number, manifest_number); + } + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) { + do { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + DestroyAndReopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + + uint64_t manifest_size = 0; + std::vector files; + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); + + for (const std::string& f : files) { + uint64_t number = 0; + FileType type; + if (ParseFileName(f.substr(1), &number, &type)) { + if (type == kDescriptorFile) { + uint64_t size_on_disk; + ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk)); + ASSERT_EQ(manifest_size, size_on_disk); + break; + } + } + } + Close(); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, GetLiveBlobFiles) { + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) below and the periodic stat dumping thread. + Options options = CurrentOptions(); + options.stats_dump_period_sec = 0; + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + Reopen(options); + + AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, checksum_value, + garbage_blob_count, garbage_blob_bytes); + // Make sure it appears in the results returned by GetLiveFiles. 
+  uint64_t manifest_size = 0;
+  std::vector<std::string> files;
+  ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+  ASSERT_FALSE(files.empty());
+  ASSERT_EQ(files[0], BlobFileName("", blob_file_number));
+
+  ColumnFamilyMetaData cfmd;
+
+  db_->GetColumnFamilyMetaData(&cfmd);
+  ASSERT_EQ(cfmd.blob_files.size(), 1);
+  const BlobMetaData& bmd = cfmd.blob_files[0];
+
+  CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes,
+                    checksum_method, checksum_value, garbage_blob_count,
+                    garbage_blob_bytes);
+  ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+  ASSERT_EQ(cfmd.blob_file_count, 1U);
+  ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+}
+#endif
+
+TEST_F(DBTest, PurgeInfoLogs) {
+  Options options = CurrentOptions();
+  options.keep_log_file_num = 5;
+  options.create_if_missing = true;
+  options.env = env_;
+  for (int mode = 0; mode <= 1; mode++) {
+    if (mode == 1) {
+      options.db_log_dir = dbname_ + "_logs";
+      ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir));
+    } else {
+      options.db_log_dir = "";
+    }
+    for (int i = 0; i < 8; i++) {
+      Reopen(options);
+    }
+
+    std::vector<std::string> files;
+    ASSERT_OK(env_->GetChildren(
+        options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files));
+    int info_log_count = 0;
+    for (std::string file : files) {
+      if (file.find("LOG") != std::string::npos) {
+        info_log_count++;
+      }
+    }
+    ASSERT_EQ(5, info_log_count);
+
+    Destroy(options);
+    // For mode (1), test DestroyDB() to delete all the logs under DB dir.
+    // For mode (2), no info log file should have been put under DB dir.
+    // Since dbname_ has no children, there is no need to loop db_files
+    std::vector<std::string> db_files;
+    ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound());
+    ASSERT_TRUE(db_files.empty());
+
+    if (mode == 1) {
+      // Cleaning up
+      ASSERT_OK(env_->GetChildren(options.db_log_dir, &files));
+      for (std::string file : files) {
+        ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file));
+      }
+      ASSERT_OK(env_->DeleteDir(options.db_log_dir));
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+  DBTest* test;
+  std::atomic<int> counter[kNumThreads];
+};
+
+struct MTThread {
+  MTState* state;
+  int id;
+  bool multiget_batched;
+};
+
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  int counter = 0;
+  std::shared_ptr<SystemClock> clock = SystemClock::Default();
+  auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
+
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  char valbuf[1500];
+  while (clock->NowMicros() < end_micros) {
+    t->state->counter[id].store(counter, std::memory_order_release);
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter>.
+      // into each of the CFs
+      // We add some padding for force compactions.
+      int unique_id = rnd.Uniform(1000000);
+
+      // Half of the time directly use WriteBatch. Half of the time use
+      // WriteBatchWithIndex.
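+      // (The two paths below should be interchangeable for this test: a
+      // WriteBatchWithIndex wraps an ordinary WriteBatch, and
+      // GetWriteBatch() hands that underlying batch to DB::Write().)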
+      if (rnd.OneIn(2)) {
+        WriteBatch batch;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+                              Slice(valbuf)));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), &batch));
+      } else {
+        WriteBatchWithIndex batch(db->GetOptions().comparator);
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+                              Slice(valbuf)));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+      }
+    } else {
+      // Read a value and verify that it matches the pattern written above
+      // and that writes to all column families were atomic (unique_id is the
+      // same)
+      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+      std::vector<std::string> values;
+      std::vector<Status> statuses;
+      if (!t->multiget_batched) {
+        statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+                                &values);
+      } else {
+        std::vector<PinnableSlice> pin_values(keys.size());
+        statuses.resize(keys.size());
+        const Snapshot* snapshot = db->GetSnapshot();
+        ReadOptions ro;
+        ro.snapshot = snapshot;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+                       &pin_values[cf], &statuses[cf]);
+        }
+        db->ReleaseSnapshot(snapshot);
+        values.resize(keys.size());
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          if (statuses[cf].ok()) {
+            values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+          }
+        }
+      }
+      Status s = statuses[0];
+      // all statuses have to be the same
+      for (size_t i = 1; i < statuses.size(); ++i) {
+        // they are either both ok or both not-found
+        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+                    (s.IsNotFound() && statuses[i].IsNotFound()));
+      }
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int unique_id = -1;
+        for (int i = 0; i < kColumnFamilies; ++i) {
+          int k, w, c, cf, u;
+          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+                              &cf, &u))
+              << values[i];
+          ASSERT_EQ(k, key);
+          ASSERT_GE(w, 0);
+          ASSERT_LT(w, kNumThreads);
+          ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+          ASSERT_EQ(cf, i);
+          if (i == 0) {
+            unique_id = u;
+          } else {
+            // this checks that updates across column families happened
+            // atomically -- all unique ids are the same
+            ASSERT_EQ(u, unique_id);
+          }
+        }
+      }
+    }
+    counter++;
+  }
+  fprintf(stderr,
+          "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // anonymous namespace
+
+class MultiThreadedDBTest
+    : public DBTest,
+      public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  void SetUp() override {
+    std::tie(option_config_, multiget_batched_) = GetParam();
+  }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> optionConfigs;
+    for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+      optionConfigs.push_back(optionConfig);
+    }
+    return optionConfigs;
+  }
+
+  bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+  if (option_config_ == kPipelinedWrite) return;
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  std::vector<std::string> cfs;
+  for (int i = 1; i < kColumnFamilies; ++i) {
+    cfs.push_back(std::to_string(i));
+  }
+  Reopen(options);
+  CreateAndReopenWithCF(cfs, options);
+  // Initialize state
+  MTState mt;
+  mt.test = this;
+  for (int id = 0; id < kNumThreads; id++) {
+    mt.counter[id].store(0, std::memory_order_release);
+  }
+
+  // Start threads
+  MTThread thread[kNumThreads];
+  for (int id = 0; id < kNumThreads; id++) {
+    thread[id].state = &mt;
+    thread[id].id = id;
+    thread[id].multiget_batched = multiget_batched_;
+    env_->StartThread(MTThreadBody, &thread[id]);
+  }
+
+  env_->WaitForJoin();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MultiThreaded, MultiThreadedDBTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+        ::testing::Bool()));
+#endif  // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(OS_WIN)
+// Disable this test temporarily on Travis and appveyor as it fails
+// intermittently. Github issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+  DB* db;
+  int id;
+  std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
+
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    std::string kv(std::to_string(i + id * kGCNumKeys));
+    ASSERT_OK(db->Put(wo, kv, kv));
+  }
+  t->done = true;
+}
+
+}  // anonymous namespace
+
+TEST_F(DBTest, GroupCommitTest) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    Reopen(options);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"WriteThread::JoinBatchGroup:BeganWaiting",
+          "DBImpl::WriteImpl:BeforeLeaderEnters"},
+         {"WriteThread::AwaitState:BlockingWaiting",
+          "WriteThread::EnterAsBatchGroupLeader:End"}});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
+    env_->WaitForJoin();
+
+    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(std::to_string(i));
+    }
+    std::sort(expected_db.begin(), expected_db.end());
+
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+    HistogramData hist_data;
+
options.statistics->histogramData(DB_WRITE, &hist_data); + ASSERT_GT(hist_data.average, 0.0); + } while (ChangeOptions(kSkipNoSeekToLast)); +} +#endif // OS_WIN + +namespace { +using KVMap = std::map; +} + +class ModelDB : public DB { + public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + + SequenceNumber GetSequenceNumber() const override { + // no need to call this + assert(false); + return 0; + } + + int64_t GetUnixTime() const override { + // no need to call this + assert(false); + return 0; + } + + uint64_t GetTimestamp() const override { + // no need to call this + assert(false); + return 0; + } + }; + + explicit ModelDB(const Options& options) : options_(options) {} + using DB::Put; + Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, + const Slice& v) override { + WriteBatch batch; + Status s = batch.Put(cf, k, v); + if (!s.ok()) { + return s; + } + return Write(o, &batch); + } + Status Put(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/, + const Slice& /*k*/, const Slice& /*ts*/, + const Slice& /*v*/) override { + return Status::NotSupported(); + } + + using DB::PutEntity; + Status PutEntity(const WriteOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, + const WideColumns& /* columns */) override { + return Status::NotSupported(); + } + + using DB::Close; + Status Close() override { return Status::OK(); } + using DB::Delete; + Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + Status s = batch.Delete(cf, key); + if (!s.ok()) { + return s; + } + return Write(o, &batch); + } + Status Delete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/, + const Slice& /*key*/, const Slice& /*ts*/) override { + return Status::NotSupported(); + } + using DB::SingleDelete; + Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + Status s = batch.SingleDelete(cf, key); + if (!s.ok()) { + return s; + } + return Write(o, &batch); + } + Status SingleDelete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/, + const Slice& /*key*/, const Slice& /*ts*/) override { + return Status::NotSupported(); + } + using DB::Merge; + Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, + const Slice& v) override { + WriteBatch batch; + Status s = batch.Merge(cf, k, v); + if (!s.ok()) { + return s; + } + return Write(o, &batch); + } + Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/, + const Slice& /*k*/, const Slice& /*ts*/, + const Slice& /*value*/) override { + return Status::NotSupported(); + } + using DB::Get; + Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/, + const Slice& key, PinnableSlice* /*value*/) override { + return Status::NotSupported(key); + } + + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + + using DB::MultiGet; + std::vector MultiGet( + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + const std::vector& keys, + std::vector* /*values*/) override { + std::vector s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } + +#ifndef ROCKSDB_LITE + using DB::IngestExternalFile; + Status IngestExternalFile( 
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*options*/) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::IngestExternalFiles;
+  Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& /*args*/) override {
+    return Status::NotSupported("Not implemented");
+  }
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::VerifyChecksum;
+  Status VerifyChecksum(const ReadOptions&) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* /*column_family*/,
+      TablePropertiesCollection* /*props*/) override {
+    return Status();
+  }
+
+  Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+      std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+    return Status();
+  }
+#endif  // ROCKSDB_LITE
+
+  using DB::KeyMayExist;
+  bool KeyMayExist(const ReadOptions& /*options*/,
+                   ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+                   std::string* /*value*/,
+                   bool* value_found = nullptr) override {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;  // Not Supported directly
+  }
+  using DB::NewIterator;
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* /*column_family*/) override {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  Status NewIterators(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+      std::vector<Iterator*>* /*iterators*/) override {
+    return Status::NotSupported("Not supported yet");
+  }
+  const Snapshot* GetSnapshot() override {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
+
+  void ReleaseSnapshot(const Snapshot* snapshot) override {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+
+  Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      void Put(const Slice& key, const Slice& value) override {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+        // ignore merge for now
+        // (*map_)[key.ToString()] = value.ToString();
+      }
+      void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
+
+  using DB::GetProperty;
+  bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+                   const Slice& /*property*/,
+                   std::string* /*value*/) override {
+    return false;
+  }
+  using DB::GetIntProperty;
+  bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/,
+                      uint64_t* /*value*/) override {
+    return false;
+  }
+  using DB::GetMapProperty;
+  bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) override {
+    return false;
+  }
+  using DB::GetAggregatedIntProperty;
+
bool GetAggregatedIntProperty(const Slice& /*property*/, + uint64_t* /*value*/) override { + return false; + } + using DB::GetApproximateSizes; + Status GetApproximateSizes(const SizeApproximationOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Range* /*range*/, int n, + uint64_t* sizes) override { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + return Status::OK(); + } + using DB::GetApproximateMemTableStats; + void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/, + const Range& /*range*/, + uint64_t* const count, + uint64_t* const size) override { + *count = 0; + *size = 0; + } + using DB::CompactRange; + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*start*/, const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation."); + } + + Status SetDBOptions( + const std::unordered_map& /*new_options*/) + override { + return Status::NotSupported("Not supported operation."); + } + + using DB::CompactFiles; + Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation."); + } + + Status PauseBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } + + Status ContinueBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } + + Status EnableAutoCompaction( + const std::vector& /*column_family_handles*/) + override { + return Status::NotSupported("Not supported operation."); + } + + void EnableManualCompaction() override { return; } + + void DisableManualCompaction() override { return; } + + using DB::NumberLevels; + int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } + + using DB::MaxMemCompactionLevel; + int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override { + return 1; + } + + using DB::Level0StopWriteTrigger; + int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override { + return -1; + } + + const std::string& GetName() const override { return name_; } + + Env* GetEnv() const override { return nullptr; } + + using DB::GetOptions; + Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override { + return options_; + } + + using DB::GetDBOptions; + DBOptions GetDBOptions() const override { return options_; } + + using DB::Flush; + Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + Status ret; + return ret; + } + Status Flush( + const ROCKSDB_NAMESPACE::FlushOptions& /*options*/, + const std::vector& /*column_families*/) override { + return Status::OK(); + } + + Status SyncWAL() override { return Status::OK(); } + + Status DisableFileDeletions() override { return Status::OK(); } + + Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } +#ifndef ROCKSDB_LITE + + Status GetLiveFiles(std::vector&, uint64_t* /*size*/, + bool /*flush_memtable*/ = true) override { + return Status::OK(); + } + + Status GetLiveFilesChecksumInfo( + FileChecksumList* /*checksum_list*/) override { + return Status::OK(); + } + + Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& /*opts*/, + std::vector* 
+          /*files*/) override {
+    return Status::OK();
+  }
+
+  Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+    return Status::OK();
+  }
+
+  Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* /*current_log_file*/) override {
+    return Status::OK();
+  }
+
+  virtual Status GetCreationTimeOfOldestFile(
+      uint64_t* /*creation_time*/) override {
+    return Status::NotSupported();
+  }
+
+  Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+  Status GetUpdatesSince(
+      ROCKSDB_NAMESPACE::SequenceNumber,
+      std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+      const TransactionLogIterator::ReadOptions& /*read_options*/ =
+          TransactionLogIterator::ReadOptions()) override {
+    return Status::NotSupported("Not supported in Model DB");
+  }
+
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                               ColumnFamilyMetaData* /*metadata*/) override {}
+#endif  // ROCKSDB_LITE
+
+  Status GetDbIdentity(std::string& /*identity*/) const override {
+    return Status::OK();
+  }
+
+  Status GetDbSessionId(std::string& /*session_id*/) const override {
+    return Status::OK();
+  }
+
+  SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+  Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+                                  std::string /*ts_low*/) override {
+    return Status::OK();
+  }
+
+  Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+                             std::string* /*ts_low*/) override {
+    return Status::OK();
+  }
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+  class ModelIter : public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {}
+    ~ModelIter() override {
+      if (owned_) delete map_;
+    }
+    bool Valid() const override { return iter_ != map_->end(); }
+    void SeekToFirst() override { iter_ = map_->begin(); }
+    void SeekToLast() override {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    void Seek(const Slice& k) override {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    void SeekForPrev(const Slice& k) override {
+      iter_ = map_->upper_bound(k.ToString());
+      Prev();
+    }
+    void Next() override { ++iter_; }
+    void Prev() override {
+      if (iter_ == map_->begin()) {
+        iter_ = map_->end();
+        return;
+      }
+      --iter_;
+    }
+
+    Slice key() const override { return iter_->first; }
+    Slice value() const override { return iter_->second; }
+    Status status() const override { return Status::OK(); }
+
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  std::string name_ = "";
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+  int len;
+  do {
+    len = (rnd->OneIn(3)
+               ? 1  // Short sometimes to encourage collisions
+               : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  } while (len < minimum);
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step, DB* model, DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step, EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+class DBTestRandomized : public DBTest,
+                         public ::testing::WithParamInterface<int> {
+ public:
+  void SetUp() override { option_config_ = GetParam(); }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> option_configs;
+    // Skip cuckoo hash as it does not support snapshots.
+    for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+      if (!ShouldSkipOptions(option_config,
+                             kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+        option_configs.push_back(option_config);
+      }
+    }
+    option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+    return option_configs;
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTestRandomized, DBTestRandomized,
+    ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
+TEST_P(DBTestRandomized, Randomized) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  DestroyAndReopen(options);
+
+  Random rnd(test::RandomSeed() + GetParam());
+  ModelDB model(options);
+  const int N = 10000;
+  const Snapshot* model_snap = nullptr;
+  const Snapshot* db_snap = nullptr;
+  std::string k, v;
+  for (int step = 0; step < N; step++) {
+    // TODO(sanjay): Test Get() works
+    int p = rnd.Uniform(100);
+    int minimum = 0;
+    if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+        option_config_ == kPlainTableFirstBytePrefix ||
+        option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+        option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+      minimum = 1;
+    }
+    if (p < 45) {  // Put
+      k = RandomKey(&rnd, minimum);
+      v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100)
+                                         : rnd.Uniform(8));
+      ASSERT_OK(model.Put(WriteOptions(), k, v));
+      ASSERT_OK(db_->Put(WriteOptions(), k, v));
+    } else if (p < 90) {  // Delete
+      k = RandomKey(&rnd, minimum);
+      ASSERT_OK(model.Delete(WriteOptions(), k));
+      ASSERT_OK(db_->Delete(WriteOptions(), k));
+    } else {  // Multi-element batch
+      WriteBatch b;
+      const int num = rnd.Uniform(8);
+      for (int i = 0; i < num; i++) {
+        if (i == 0 || !rnd.OneIn(10)) {
+          k = RandomKey(&rnd, minimum);
+        } else {
+          // Periodically re-use the same key from the previous iteration, so
+          // the write batch contains multiple entries for the same key.
+        }
+        if (rnd.OneIn(2)) {
+          v = rnd.RandomString(rnd.Uniform(10));
+          ASSERT_OK(b.Put(k, v));
+        } else {
+          ASSERT_OK(b.Delete(k));
+        }
+      }
+      ASSERT_OK(model.Write(WriteOptions(), &b));
+      ASSERT_OK(db_->Write(WriteOptions(), &b));
+    }
+
+    if ((step % 100) == 0) {
+      // For DB instances that use the hash index + block-based table, the
+      // iterator becomes invalid as soon as it seeks a non-existent key,
+      // rather than returning a key that is close to it.
+      if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+          option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+      }
+
+      // Save a snapshot from each DB this time that we'll use next
+      // time we compare things, to make sure the current state is
+      // preserved with the snapshot
+      if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+      if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+      Reopen(options);
+      ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+      model_snap = model.GetSnapshot();
+      db_snap = db_->GetSnapshot();
+    }
+  }
+  if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+  if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("k2", "v2"));
+
+  // Reopen with a different prefix extractor, make sure everything still
+  // works. RocksDB should just fall back to the binary index.
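+  // Background for the reopens below (an editorial sketch, not executed by
+  // this test): each SST file records the name of the prefix extractor it
+  // was built with in its TableProperties. If the DB is later opened with a
+  // different extractor, the hash index in those files cannot be used and
+  // reads fall back to binary search. One could observe the recorded name
+  // roughly like this (hedged sketch; the exact property value shown is an
+  // assumption):
+  //
+  //   TablePropertiesCollection props;
+  //   ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  //   for (const auto& entry : props) {
+  //     // e.g. "rocksdb.FixedPrefix.1" for NewFixedPrefixTransform(1)
+  //     fprintf(stderr, "%s -> %s\n", entry.first.c_str(),
+  //             entry.second->prefix_extractor_name.c_str());
+  //   }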
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+  // Back to original
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+#endif  // !ROCKSDB_LITE
+
+  // Same if there's a problem initially loading the prefix transform
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+      [&](void* arg) { *static_cast<bool*>(arg) = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+  // Change again
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+#endif  // !ROCKSDB_LITE
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopen with no prefix extractor, make sure everything still works.
+  // RocksDB should just fall back to the binary index.
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset();
+
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewCappedPrefixTransform(2));
+
+  Reopen(options);
+  ASSERT_OK(Put("kk1", "v1"));
+  ASSERT_OK(Put("kk2", "v2"));
+  ASSERT_OK(Put("kk", "v3"));
+  ASSERT_OK(Put("k", "v4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("v1", Get("kk1"));
+  ASSERT_EQ("v2", Get("kk2"));
+
+  ASSERT_EQ("v3", Get("kk"));
+  ASSERT_EQ("v4", Get("k"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.max_open_files = 10;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max open files to at least 20. Modify it back.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force evict tables
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Make the table cache keep one entry.
+  dbfull()->TEST_table_cache()->SetCapacity(1);
+
+  ReadOptions read_options;
+  read_options.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+
+  // After a total order seek, the prefix index should still be usable.
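+  // total_order_seek = true makes the iterator ignore the prefix extractor
+  // and seek across the whole key space (bypassing the hash index), while
+  // the default prefix mode only guarantees iteration within one prefix. A
+  // hedged sketch of the prefix-scoped counterpart (illustrative only):
+  //
+  //   ReadOptions ro;  // total_order_seek is false by default
+  //   std::unique_ptr<Iterator> it(db_->NewIterator(ro));
+  //   for (it->Seek("k"); it->Valid() && it->key().starts_with("k");
+  //        it->Next()) {
+  //     // visits only keys sharing the prefix "k"
+  //   }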
+  read_options.total_order_seek = false;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ChecksumTest) {
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Flush());  // table with crc checksum
+
+  table_options.checksum = kxxHash;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("e", "f"));
+  ASSERT_OK(Put("g", "h"));
+  ASSERT_OK(Flush());  // table with xxhash checksum
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+  for (int iter = 0; iter < 2; ++iter) {
+    // first iteration -- auto compaction
+    // second iteration -- manual compaction
+    Options options;
+    options.compaction_style = kCompactionStyleFIFO;
+    options.write_buffer_size = 100 << 10;  // 100KB
+    options.arena_block_size = 4096;
+    options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options.max_subcompactions = max_subcompactions_;
+    if (iter == 1) {
+      options.disable_auto_compactions = true;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 6; ++i) {
+      for (int j = 0; j < 110; ++j) {
+        ASSERT_OK(Put(std::to_string(i * 100 + j), rnd.RandomString(980)));
+      }
+      // flush should happen here
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (iter == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    } else {
+      CompactRangeOptions cro;
+      cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+      ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    }
+    // only 5 files should survive
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    for (int i = 0; i < 50; ++i) {
+      // these keys should have been deleted in the previous compaction
+      ASSERT_EQ("NOT_FOUND", Get(std::to_string(i)));
+    }
+  }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 20 << 10;  // 20KB
+  options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // ~1.5MB
+  options.compaction_options_fifo.allow_compaction = true;
+  options.level0_file_num_compaction_trigger = 6;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 60; i++) {
+    // Generate and flush a file of about 20KB.
+    for (int j = 0; j < 20; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  // It should be compacted to 10 files.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  for (int i = 0; i < 60; i++) {
+    // Generate and flush a file of about 20KB.
+    for (int j = 0; j < 20; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  // It should be compacted to no more than 20 files.
+  ASSERT_GT(NumTableFilesAtLevel(0), 10);
+  ASSERT_LT(NumTableFilesAtLevel(0), 18);
+  // Size limit is still guaranteed.
+  ASSERT_LE(SizeAtLevel(0),
+            options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 20 << 10;  // 20KB
+  options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // ~1.5MB
+  options.compaction_options_fifo.allow_compaction = true;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 3; i++) {
+    // Each file contains a different key which will be dropped later.
+    ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Put("key" + std::to_string(i), ""));
+    ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ("", Get("key" + std::to_string(i)));
+  }
+  for (int i = 0; i < 3; i++) {
+    // Each file contains a different key which will be dropped later.
+    ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Delete("key" + std::to_string(i)));
+    ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+  }
+}
+
+// Check that FIFO-with-TTL is not supported with max_open_files != -1.
+// Github issue #8014
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.create_if_missing = true;
+  options.ttl = 600;  // seconds
+
+  // TTL is not supported with max_open_files != -1.
+  options.max_open_files = 0;
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  options.max_open_files = 100;
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  // TTL is supported with unlimited max_open_files
+  options.max_open_files = -1;
+  ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.create_if_missing = true;
+  options.ttl = 600;  // seconds
+
+  options = CurrentOptions(options);
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  ASSERT_OK(TryReopen(options));
+
+  Destroy(options);
+  options.table_factory.reset(NewPlainTableFactory());
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  Destroy(options);
+  options.table_factory.reset(NewAdaptiveTableFactory());
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.arena_block_size = 4096;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // Test to make sure that all files with expired ttl are deleted on next
+  // manual compaction.
+  {
+    // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file of about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+
+    // Since no flushes and compactions have run, the db should still be in
+    // the same state even after considerable time has passed.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  }
+
+  // Test to make sure that all files with expired ttl are deleted on next
+  // automatic compaction.
+  {
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file of about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Create 1 more file to trigger TTL compaction. The old files are dropped.
+    for (int i = 0; i < 1; i++) {
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+    }
+
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the newly created file remains.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test that shows the fallback to size-based FIFO compaction if TTL-based
+  // deletion doesn't bring the total size below max_table_files_size.
+  {
+    options.write_buffer_size = 10 << 10;                              // 10KB
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 3; i++) {
+      // Generate and flush a file of about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+    for (int i = 0; i < 5; i++) {
+      for (int j = 0; j < 140; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // Size limit is still guaranteed.
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test with TTL + Intra-L0 compactions.
+  {
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = true;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options.level0_file_num_compaction_trigger = 6;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file of about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // With Intra-L0 compaction, out of 10 files, 6 files will be compacted
+    // to 1 (due to level0_file_num_compaction_trigger = 6).
+    // So total files = 1 + remaining 4 = 5.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+    // Create 10 more files. The old 5 files are dropped as their ttl expired.
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test with large TTL + Intra-L0 compactions.
+  // Files are dropped based on size, as the ttl doesn't kick in.
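+  // How the FIFO knobs used below interact: ttl drops whole files once they
+  // are old enough, allow_compaction additionally merges small L0 files
+  // (intra-L0), and max_table_files_size evicts the oldest files whenever
+  // the total size exceeds the cap. A minimal configuration sketch (the
+  // values are illustrative assumptions, not taken from this test):
+  //
+  //   Options fifo_opts;
+  //   fifo_opts.compaction_style = kCompactionStyleFIFO;
+  //   fifo_opts.ttl = 60 * 60;  // age-based eviction after one hour
+  //   fifo_opts.compaction_options_fifo.allow_compaction = true;
+  //   fifo_opts.compaction_options_fifo.max_table_files_size = 1500 << 10;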
+  {
+    options.write_buffer_size = 20 << 10;  // 20KB
+    options.compaction_options_fifo.max_table_files_size =
+        1500 << 10;  // ~1.5MB
+    options.compaction_options_fifo.allow_compaction = true;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options.level0_file_num_compaction_trigger = 6;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 60; i++) {
+      // Generate and flush a file of about 20KB.
+      for (int j = 0; j < 20; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // It should be compacted to 10 files.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    for (int i = 0; i < 60; i++) {
+      // Generate and flush a file of about 20KB.
+      for (int j = 0; j < 20; j++) {
+        ASSERT_OK(
+            Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+
+    // It should be compacted to no more than 20 files.
+    ASSERT_GT(NumTableFilesAtLevel(0), 10);
+    ASSERT_LT(NumTableFilesAtLevel(0), 18);
+    // Size limit is still guaranteed.
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disabled as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1 << 20;         // 1MB
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 1 << 20;     // 1MB
+  options.max_bytes_for_level_base = 4 << 20;  // 4MB
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  // # no rate limiting
+  Random rnd(301);
+  uint64_t start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  uint64_t elapsed = env_->NowMicros() - start;
+  double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
+  uint64_t rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+  ASSERT_EQ(0, rate_limiter_drains);
+  Close();
+
+  // # rate limiting with 0.7 x threshold
+  options.rate_limiter.reset(
+      NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+      rate_limiter_drains;
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  // Most intervals should've been drained (interval time is 100ms, elapsed is
+  // micros)
+  ASSERT_GT(rate_limiter_drains, 0);
+  ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+  double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+  ASSERT_TRUE(ratio < 0.8);
+
+  // # rate limiting with half of the raw_rate
+  options.rate_limiter.reset(
+      NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+      rate_limiter_drains;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  // Most intervals should've been drained (interval time is 100ms, elapsed is
+  // micros)
+  ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+  ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+  ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+  ASSERT_LT(ratio, 0.6);
+}
+
+// This is a mock custom rate limiter that does not implement the optional
+// APIs (e.g., RateLimiter::GetTotalPendingRequests()).
+class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
+ public:
+  MockedRateLimiterWithNoOptionalAPIImpl() {}
+
+  ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
+
+  void SetBytesPerSecond(int64_t bytes_per_second) override {
+    (void)bytes_per_second;
+  }
+
+  using RateLimiter::Request;
+  void Request(const int64_t bytes, const Env::IOPriority pri,
+               Statistics* stats) override {
+    (void)bytes;
+    (void)pri;
+    (void)stats;
+  }
+
+  int64_t GetSingleBurstBytes() const override { return 200; }
+
+  int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    (void)pri;
+    return 0;
+  }
+
+  int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    (void)pri;
+    return 0;
+  }
+
+  int64_t GetBytesPerSecond() const override { return 0; }
+};
+
+// Test that a custom rate limiter that does not implement the optional APIs
+// (e.g., RateLimiter::GetTotalPendingRequests()) works fine with basic
+// RocksDB operations (e.g., Put, Get, Flush).
+TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) {
+  Options options = CurrentOptions();
+  options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl());
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc", "def"));
+  ASSERT_EQ(Get("abc"), "def");
+  ASSERT_OK(Flush());
+  ASSERT_EQ(Get("abc"), "def");
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+  options.table_factory.reset(NewPlainTableFactory());
+  options.prefix_extractor.reset(NewNoopTransform());
+  Destroy(options);
+  ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test the check of prefix_extractor when the hash index is used for a
+  // block-based table.
+  BlockBasedTableOptions to;
+  to.index_type = BlockBasedTableOptions::kHashSearch;
+  options = CurrentOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(to));
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+  Options options = CurrentOptions();
+  options.allow_concurrent_memtable_write = true;
+  options.soft_pending_compaction_bytes_limit = 0;
+  options.hard_pending_compaction_bytes_limit = 100;
+  options.create_if_missing = true;
+
+  DestroyDB(dbname_, options);
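+  // Only memtable implementations that support concurrent inserts (such as
+  // the default skip list) may be combined with
+  // allow_concurrent_memtable_write; the hash-linklist factory below does
+  // not, so the reopen must fail. A hedged sketch of the capability query
+  // (assuming it is exposed via
+  // MemTableRepFactory::IsInsertConcurrentlySupported()):
+  //
+  //   std::unique_ptr<MemTableRepFactory> f(
+  //       NewHashLinkListRepFactory(4, 0, 3, true, 4));
+  //   assert(!f->IsInsertConcurrentlySupported());
+  //   f.reset(new SkipListFactory);
+  //   assert(f->IsInsertConcurrentlySupported());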
+  options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+  ASSERT_NOK(TryReopen(options));
+
+  options.memtable_factory.reset(new SkipListFactory);
+  ASSERT_OK(TryReopen(options));
+
+  ColumnFamilyOptions cf_options(options);
+  cf_options.memtable_factory.reset(
+      NewHashLinkListRepFactory(4, 0, 3, true, 4));
+  ColumnFamilyHandle* handle;
+  ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+  for (int attempt = 0; attempt < 2; attempt++) {
+    const size_t kTotalTasks = 8;
+    test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+    Options options = CurrentOptions();
+    if (attempt == 0) {
+      options.max_background_compactions = 3;
+      options.max_background_flushes = 2;
+    }
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 4 tasks into the low priority queue and 4 tasks into the high
+      // priority queue.
+      env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                     &sleeping_tasks[i],
+                     (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+    }
+
+    // Wait up to 10s for them to be scheduled.
+    for (int i = 0; i < 10000; i++) {
+      if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+          options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+        break;
+      }
+      env_->SleepForMicroseconds(1000);
+    }
+
+    // pool size 3, total tasks 4. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // pool size 2, total tasks 4. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      sleeping_tasks[i].WakeUp();
+      sleeping_tasks[i].WaitUntilDone();
+    }
+
+    ASSERT_OK(Put("abc", "def"));
+    ASSERT_EQ("def", Get("abc"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ("def", Get("abc"));
+  }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+  std::vector<port::Thread> threads;
+  dbfull()->TEST_LockMutex();
+  auto w = dbfull()->TEST_BeginWrite();
+  threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); });
+  env_->SleepForMicroseconds(10000);
+  threads.emplace_back([&] { ASSERT_OK(Flush()); });
+  env_->SleepForMicroseconds(10000);
+  dbfull()->TEST_UnlockMutex();
+  dbfull()->TEST_LockMutex();
+  dbfull()->TEST_EndWrite(w);
+  dbfull()->TEST_UnlockMutex();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+  const size_t cnt = 100;
+  Options options;
+  options.env = env_;
+  WriteOptions wopt;
+  ReadOptions ropt;
+  for (bool two_write_queues : {false, true}) {
+    for (bool manual_wal_flush : {false, true}) {
+      options.two_write_queues = two_write_queues;
+      options.manual_wal_flush = manual_wal_flush;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      std::vector<port::Thread> threads;
+      threads.emplace_back([&] {
+        for (size_t i = 0; i < cnt; i++) {
+          auto istr = std::to_string(i);
+          ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr,
+                             "b" + istr));
+        }
+      });
+      if (two_write_queues) {
+        threads.emplace_back([&] {
+          for (size_t i = cnt; i < 2 * cnt; i++) {
+            auto istr = std::to_string(i);
+            WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                             wopt.protection_bytes_per_key,
+                             0 /* default_cf_ts_sz */);
+            ASSERT_OK(batch.Put("a" + istr, "b" + istr));
+            ASSERT_OK(
+                dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
+          }
+        });
+      }
+      threads.emplace_back([&] {
+        for (size_t i = 0; i < cnt * 100; i++) {  // FlushWAL is faster than Put
+          ASSERT_OK(db_->FlushWAL(false));
+        }
+      });
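+      // FlushWAL(sync) moves writes buffered by manual_wal_flush from the
+      // in-memory WAL buffer into the WAL file, and additionally fsyncs the
+      // file when sync is true; FlushWAL(false) is cheap enough to call in
+      // the tight loop above. A hedged usage sketch (illustrative only):
+      //
+      //   ASSERT_OK(db_->FlushWAL(/*sync=*/false));  // hand buffer to the OS
+      //   ASSERT_OK(db_->FlushWAL(/*sync=*/true));   // also fsync the file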
+      for (auto& t : threads) {
+        t.join();
+      }
+      options.create_if_missing = false;
+      // Recover from the wal and make sure that it is not corrupted
+      Reopen(options);
+      for (size_t i = 0; i < cnt; i++) {
+        PinnableSlice pval;
+        auto istr = std::to_string(i);
+        ASSERT_OK(
+            db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+        ASSERT_TRUE(pval == ("b" + istr));
+      }
+    }
+  }
+}
+
+// A failure of this test will only be caught with some probability.
+TEST_F(DBTest, ManualFlushWalAndWriteRace) {
+  Options options;
+  options.env = env_;
+  options.manual_wal_flush = true;
+  options.create_if_missing = true;
+
+  DestroyAndReopen(options);
+
+  WriteOptions wopts;
+  wopts.sync = true;
+
+  port::Thread writeThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      auto istr = std::to_string(i);
+      ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr));
+    }
+  });
+  port::Thread flushThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(dbfull()->FlushWAL(false));
+    }
+  });
+
+  writeThread.join();
+  flushThread.join();
+  ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
+  ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
+  Reopen(options);
+  ASSERT_EQ("value1", Get("foo1"));
+  ASSERT_EQ("value2", Get("foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k5KB = 5 * 1024;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.max_background_compactions = 1;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 16 * 1024;
+  options.max_write_buffer_number = 2;
+  // Don't trigger compact/slowdown/stop
+  options.level0_file_num_compaction_trigger = 1024;
+  options.level0_slowdown_writes_trigger = 1024;
+  options.level0_stop_writes_trigger = 1024;
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int size) {
+    const int kNumPutsBeforeWaitForFlush = 64;
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+
+      // The following condition prevents a race condition between flush jobs
+      // acquiring work and this thread filling up multiple memtables. Without
+      // this, the flush might produce fewer files than expected because
+      // multiple memtables are flushed into a single L0 file. This race
+      // condition affects assertion (A).
+      if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  };
+
+  // Test write_buffer_size
+  gen_l0_kb(64);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+  ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+  // Clean up L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Increase buffer size
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "131072"},
+  }));
+
+  // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+  // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+  gen_l0_kb(192);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);  // (A)
+  ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+  // Decrease buffer size below current usage
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "65536"},
+  }));
+  // The existing memtable became eligible for flush when we reduced its
+  // capacity to 64KB.
+  // Two keys need to be added to trigger flush: the first causes the
+  // memtable to be marked full, the second schedules the flush. Then we
+  // should have a 128KB L0 file, a 64KB L0 file, and a memtable with just
+  // one key.
+  gen_l0_kb(2);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+  // Test max_write_buffer_number
+  // Block the compaction thread, which will also block the flushes because
+  // max_background_flushes == 0, so flushes are getting executed by the
+  // compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction but trigger is pretty high
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(0, Env::HIGH);
+
+  // Put until writes are stopped, bounded by 256 puts. We should see stop at
+  // ~128KB
+  int count = 0;
+  Random rnd(301);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  while (!sleeping_task_low.WokenUp() && count < 256) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+  ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+  sleeping_task_low.WaitUntilDone();
+
+  // Increase
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_write_buffer_number", "8"},
+  }));
+  // Clean up memtable and L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
+
+  // Decrease
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_write_buffer_number", "4"},
+  }));
+  // Clean up memtable and L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 256 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+                          int expected_count) {
+  int op_count = 0;
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    if (thread.operation_type == op_type) {
+      op_count++;
+    }
+  }
+  ASSERT_EQ(op_count, expected_count);
+}
+}  // anonymous namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  TryReopen(options);
+
+  std::vector<ThreadStatus> thread_list;
+  Status s = env_->GetThreadList(&thread_list);
+
+  for (int i = 0; i < 2; ++i) {
+    // repeat the test with a different number of high / low priority threads
+    const int kTestCount = 3;
+    const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+    const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+    const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+    for (int test = 0; test < kTestCount; ++test) {
+      // Change the number of threads in the high / low priority pools.
+      env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+      env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+      env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure that all threads have been registered
+      unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+      // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+      // all threads have been registered.
+      // Try up to 60 seconds.
+      for (int num_try = 0; num_try < 60000; num_try++) {
+        env_->SleepForMicroseconds(1000);
+        thread_list.clear();
+        s = env_->GetThreadList(&thread_list);
+        ASSERT_OK(s);
+        memset(thread_type_counts, 0, sizeof(thread_type_counts));
+        for (auto thread : thread_list) {
+          ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+          thread_type_counts[thread.thread_type]++;
+        }
+        if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+                kHighPriCounts[test] &&
+            thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+                kLowPriCounts[test] &&
+            thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+                kBottomPriCounts[test]) {
+          break;
+        }
+      }
+      // Verify the number of high-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+                kHighPriCounts[test]);
+      // Verify the number of low-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+                kLowPriCounts[test]);
+      // Verify the number of bottom-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+                kBottomPriCounts[test]);
+    }
+    if (i == 0) {
+      // repeat the test with multiple column families
+      CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+      env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                     true);
+    }
+  }
+  ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+  delete handles_[2];
+  handles_.erase(handles_.begin() + 2);
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 true);
+  Close();
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = false;
+  TryReopen(options);
+  CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify that none of the column family info exists
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.enable_thread_tracking = true;
+  options = CurrentOptions(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+      {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_EQ("v1", Get(1, "foo"));
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  uint64_t num_running_flushes = 0;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+                                  &num_running_flushes));
+  ASSERT_EQ(num_running_flushes, 0);
+
+  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
+
+  // The first sync point makes sure there's one flush job
+  // running when we perform VerifyOperationCount().
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+                                  &num_running_flushes));
+  ASSERT_EQ(num_running_flushes, 1);
+  // The second sync point ensures the flush job does not
+  // complete until we have performed VerifyOperationCount().
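+  // LoadDependency takes {predecessor, successor} pairs: the first-named
+  // marker must fire before the second-named one may proceed, which is how
+  // this test holds the flush job at a known point. A hedged sketch of the
+  // general pattern (marker names here are illustrative assumptions):
+  //
+  //   SyncPoint::GetInstance()->LoadDependency(
+  //       {{"Job::Step1", "Test::Observe"},   // Step1 runs before Observe
+  //        {"Test::Done", "Job::Step2"}});    // Step2 waits for Done
+  //   SyncPoint::GetInstance()->EnableProcessing();
+  //   TEST_SYNC_POINT("Test::Observe");  // reached only after Job::Step1
+  //   // ... assertions while the job is frozen ...
+  //   TEST_SYNC_POINT("Test::Done");     // unblocks Job::Step2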
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  const int kNumL0Files = 4;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_subcompactions = max_subcompactions_;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+      {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+      {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+  });
+  for (int tests = 0; tests < 2; ++tests) {
+    DestroyAndReopen(options);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    // The Put Phase.
+    for (int file = 0; file < kNumL0Files; ++file) {
+      for (int key = 0; key < kEntriesPerBuffer; ++key) {
+        ASSERT_OK(Put(std::to_string(key + file * kEntriesPerBuffer),
+                      rnd.RandomString(kTestValueSize)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put phase.
+    uint64_t num_running_compactions = 0;
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+                                    &num_running_compactions));
+    ASSERT_EQ(num_running_compactions, 0);
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+    ASSERT_GE(NumTableFilesAtLevel(0),
+              options.level0_file_num_compaction_trigger);
+
+    // This makes sure at least one compaction is running.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+    if (options.enable_thread_tracking) {
+      // expecting one single L0 to L1 compaction
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+    } else {
+      // If thread tracking is not enabled, compaction count should be 0.
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+    }
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+                                    &num_running_compactions));
+    ASSERT_EQ(num_running_compactions, 1);
+    // TODO(yhchiang): adding assert to verify each compaction stage.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+    // repeat the test with thread tracking disabled.
+    options.enable_thread_tracking = false;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p", "q");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+    CancelAllBackgroundWork(db_);
+    ASSERT_TRUE(
+        db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
+            .IsShutdownInProgress());
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+    if (iter == 0) {
+      options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "key", "value"));
+  CancelAllBackgroundWork(db_);
+  Status s =
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+  ASSERT_TRUE(s.IsShutdownInProgress());
+}
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.max_subcompactions = max_subcompactions_;
+
+  TryReopen(options);
+  Random rnd(301);
+
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+       {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+        "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
"DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize))); + } + + ASSERT_OK(env_->GetThreadList(&thread_list)); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + } + } + + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Record the number of compactions at a time. + for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + ASSERT_OK(env_->GetThreadList(&thread_list)); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); +} + +TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; + + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; + + TryReopen(options); + Random rnd(301); + + std::vector thread_list; + // Delay both flush and compaction + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", + "CompactionJob::Run():Inprogress"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, + {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < 
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+    }
+
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+    }
+  }
+
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Count the operations again; no compaction should be running anymore.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  ASSERT_OK(env_->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(Put("foo", "v1", wo));
+  CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 20480;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kNoCompression;
+  options.compression_per_level[2] = kSnappyCompression;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // Insert more than 80K. L4 should be the base level. Neither L0 nor L4
+  // should be compressed, so the total data size should be more than 80K.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each file's metadata is at least 50 bytes.
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+  // Insert 400KB total. Some data will be compressed.
+  for (int i = 21; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+            120U * 4000U + 50U * 24);
+  // Make sure the data in the L3 files is not compacted, by removing all
+  // files in L4 and counting the number of rows.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  for (auto file : cf_meta.levels[4].files) {
+    listener->SetExpectedFileName(dbname_ + file.name);
+    ASSERT_OK(dbfull()->DeleteFile(file.name));
+  }
+  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+    return;
+  }
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000000;
+  options.write_buffer_size = 600000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.target_file_size_base = 20;
+  options.env = env_;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<TableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
+
+  DestroyAndReopen(options);
+  // When the base level is L4, L4 is LZ4.
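+  // With dynamic level bytes, the three compression_per_level entries map to
+  // L0, the current base level, and the levels below it: L0 stays
+  // uncompressed, the base level (initially L4) gets LZ4, and once the base
+  // level moves up to L3, L4 gets Zlib. The sync-point callbacks below count
+  // how often each compression choice is actually made.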
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 100; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 25 == 24) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After the base level turns from L4 into L3, L3 becomes LZ4 and L4
+  // becomes Zlib.
+  num_lz4.store(0);
+  num_no.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+  // Minimum write buffer size is enforced at 64KB.
+  const uint64_t k32KB = 1 << 15;
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k1MB = 1 << 20;
+  const uint64_t k4KB = 1 << 12;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 4 * k4KB;
+  options.max_write_buffer_number = 2;
+  // Compaction related options
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.target_file_size_base = k64KB;
+  options.max_compaction_bytes = options.target_file_size_base * 10;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = k128KB;
+  options.max_bytes_for_level_multiplier = 4;
+
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int start, int size, int stride) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  };
+
+  // Write 3 files that have the same key range.
+  // Since level0_file_num_compaction_trigger is 3, compaction should be
+  // triggered. The compaction should result in one L1 file.
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1", FilesPerLevel());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  ASSERT_LE(metadata[0].size, k64KB + k4KB);
+  ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+  // Test the compaction trigger and target_file_size_base.
+  // Reduce the compaction trigger to 2, and reduce the L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, the compaction merges them and
+  // should result in two 32KB L1 files.
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+                            {"target_file_size_base", std::to_string(k32KB)}}));
+
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ("1,1", FilesPerLevel());
+  gen_l0_kb(0, 64, 1);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,2", FilesPerLevel());
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_LE(metadata[0].size, k32KB + k4KB);
+  ASSERT_GE(metadata[0].size, k32KB - k4KB);
+  ASSERT_LE(metadata[1].size, k32KB + k4KB);
+  ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+  // Test max_bytes_for_level_base.
+  // Increase the level base size to 1MB and write enough data to fill
+  // L1 and L2. L1 size should be around 1MB while L2 size should be
+  // around 1MB x 4.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_bytes_for_level_base", std::to_string(k1MB)}}));
+
+  // writing 96 x 64KB => 6 * 1024KB
+  // (L1 + L2) = (1 + 4) * 1024KB
+  for (int i = 0; i < 96; ++i) {
+    gen_l0_kb(i, 64, 96);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+  ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+  // Within (0.5, 1.5) of 4MB.
+  ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+  ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now, reduce both the multiplier and the level
+  // base. After filling enough data that can fit in L1 - L3, we should see
+  // the L1 size drop to 128KB from the ~1MB asserted previously. Same for L2.
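+  // With base 128KB and multiplier 2, the expected steady-state sizes are
+  // L1 = 128KB, L2 = 256KB, L3 = 512KB, i.e. (1 + 2 + 4) * 128KB = 7 * 128KB
+  // in total, which is exactly the bound checked below (with 50% slack).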
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_bytes_for_level_multiplier", "2"},
+       {"max_bytes_for_level_base", std::to_string(k128KB)}}));
+
+  // writing 20 x 64KB = 10 x 128KB
+  // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+  for (int i = 0; i < 20; ++i) {
+    gen_l0_kb(i, 64, 32);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+  ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+  // Test level0_stop_writes_trigger.
+  // Clean up the memtable and L0, and block the compaction threads. If we
+  // then keep writing and flushing memtables, puts should stop after 8
+  // memtable flushes, since level0_stop_writes_trigger = 8.
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Block compaction
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  int count = 0;
+  Random rnd(301);
+  WriteOptions wo;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    count++;
+    if (dbfull()->TEST_write_controler().IsStopped()) {
+      sleeping_task_low.WakeUp();
+      break;
+    }
+  }
+  // Stop trigger = 8
+  ASSERT_EQ(count, 8);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
+
+  // Now reduce level0_stop_writes_trigger to 6. Clean up the memtables and
+  // L0, and block the compaction thread again. Perform puts and memtable
+  // flushes until we see the stop after 6 memtable flushes.
+  ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Block compaction again
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  count = 0;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    count++;
+    if (dbfull()->TEST_write_controler().IsStopped()) {
+      sleeping_task_low.WakeUp();
+      break;
+    }
+  }
+  ASSERT_EQ(count, 6);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
+
+  // Test disable_auto_compactions.
+  // The compaction thread is unblocked, but auto compaction is disabled.
+  // Write 4 L0 files; compaction would normally be triggered, but with auto
+  // compaction disabled TEST_WaitForCompact has nothing to wait for, and the
+  // number of L0 files does not change after the call.
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+    // Wait for compaction so that the put won't stall
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+  // Enable auto compaction and perform the same test; the number of L0 files
+  // should be reduced after compaction.
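+  // Since level0_file_num_compaction_trigger is still 2 from earlier in this
+  // test, the four L0 files written below should be merged down once auto
+  // compaction is back on, which is what the final ASSERT_LT checks.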
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); + // Wait for compaction so that put won't stop + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_LT(NumTableFilesAtLevel(0), 4); +} + +// Test dynamic FIFO compaction options. +// This test covers just option parsing and makes sure that the options are +// correctly assigned. Also look at DBOptionsTest.SetFIFOCompactionOptions +// test which makes sure that the FIFO compaction funcionality is working +// as expected on dynamically changing the options. +// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* . +TEST_F(DBTest, DynamicFIFOCompactionOptions) { + Options options; + options.ttl = 0; + options.create_if_missing = true; + options.env = env_; + DestroyAndReopen(options); + + // Initial defaults + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024 * 1024 * 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 0); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=23;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 0); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 97); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{allow_compaction=true;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=31;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 31); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{max_table_files_size=51;allow_compaction=true;}"}})); + ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 51); + ASSERT_EQ(dbfull()->GetOptions().ttl, 49); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); +} + +TEST_F(DBTest, DynamicUniversalCompactionOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + DestroyAndReopen(options); + + // Initial defaults + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, + 
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{size_ratio=7;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            11u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.target_file_size_base = 200000;
+  options.max_bytes_for_level_base = 1000000;
+  options.max_bytes_for_level_multiplier = 2;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  constexpr int kCDTKeysPerBuffer = 4;
+  constexpr int kTestSize = kCDTKeysPerBuffer * 4096;
+  constexpr int kTotalIteration = 20;
+  // The second half of the test involves random failure
+  // of file creation.
+  constexpr int kRandomFailureTest = kTotalIteration / 2;
+
+  std::vector<std::string> values;
+  for (int i = 0; i < kTestSize; ++i) {
+    values.push_back("NOT_FOUND");
+  }
+  for (int j = 0; j < kTotalIteration; ++j) {
+    if (j == kRandomFailureTest) {
+      env_->non_writeable_rate_.store(90);
+    }
+    for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+      std::string value = rnd.RandomString(100);
+      Status s = Put(Key(k), Slice(value));
+      if (s.ok()) {
+        // Update the latest successful put
+        values[k] = value;
+      }
+      // But everything before we start injecting failures should succeed.
+      if (j < kRandomFailureTest) {
+        ASSERT_OK(s);
+      }
+    }
+  }
+
+  // If RocksDB does not do the correct job, an internal assert will fail here.
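+  // Errors hit by background flush and compaction jobs are surfaced through
+  // the Status returned by these TEST_WaitFor* helpers, so with 90% of file
+  // creations failing, both are expected to report an IO error.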
+  ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError());
+  ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError());
+
+  // Verify we have the latest successful update.
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+
+  // Reopen and re-verify we have the latest successful update.
+  env_->non_writeable_rate_.store(0);
+  Reopen(options);
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+  // Test max_sequential_skip_in_iterations
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 16;
+  options.compression = kNoCompression;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+    int key0 = key_start;
+    int key1 = key_start + 1;
+    int key2 = key_start + 2;
+    Random rnd(301);
+    ASSERT_OK(Put(Key(key0), rnd.RandomString(8)));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(Key(key1), rnd.RandomString(8)));
+    }
+    ASSERT_OK(Put(Key(key2), rnd.RandomString(8)));
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(key1));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+    ASSERT_EQ(num_reseek,
+              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+  };
+  // No reseek
+  assert_reseek_count(100, 0);
+
+  ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+  // Clear the memtable so the new option takes effect.
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // Trigger reseek
+  assert_reseek_count(200, 1);
+
+  ASSERT_OK(
+      dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+  // Clear the memtable so the new option takes effect.
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // No reseek
+  assert_reseek_count(300, 1);
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Test soft_pending_compaction_bytes_limit,
+  // hard_pending_compaction_bytes_limit
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+                    {"hard_pending_compaction_bytes_limit", "300"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+  ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+  // Test report_bg_io_stats
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+  // Sanity check
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+  // Test compression
+  // Sanity check
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+  if (Snappy_Supported()) {
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+    ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                       &mutable_cf_options));
+    ASSERT_EQ(CompressionType::kSnappyCompression,
+              mutable_cf_options.compression);
+  }
+
+  // Test paranoid_file_checks (already covered in db_block_cache_test).
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+  ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1], {{"check_flush_compaction_key_order", "false"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+  const int kNumLevels = 3;
+  const int kNumKeysPerLevel = 10000;
+  const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel;
+
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  Reopen(options);
+
+  // After the loop below there will be one file on each of L0, L1, and L2.
+  int key = 0;
+  for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) {
+    for (int i = 0; i < kNumKeysPerLevel; ++i) {
+      ASSERT_OK(Put(Key(key), "val"));
+      key++;
+    }
+    ASSERT_OK(Flush());
+    for (int input_level = 0; input_level < output_level; ++input_level) {
+      // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to
+      // `input_level + 1`.
+      ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr));
+    }
+  }
+  assert(key == kNumKeysPerDb);
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  for (int i = 0; i < kNumKeysPerDb; i++) {
+    ASSERT_EQ(Get(Key(i)), "val");
+  }
+
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) +
+                               TestGetTickerCount(options, GET_HIT_L1) +
+                               TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  // iter 0 -- zlib
+  // iter 1 -- bzip2
+  // iter 2 -- lz4
+  // iter 3 -- lz4HC
+  // iter 4 -- xpress
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      continue;
+    }
+    // first_table_version 1 -- generate with table_version == 1, read with
+    // table_version == 2
+    // first_table_version 2 -- generate with table_version == 2, read with
+    // table_version == 1
+    for (int first_table_version = 1; first_table_version <= 2;
+         ++first_table_version) {
+      BlockBasedTableOptions table_options;
+      table_options.format_version = first_table_version;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      Options options = CurrentOptions();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      options.create_if_missing = true;
+      options.compression = comp;
+      DestroyAndReopen(options);
+
+      int kNumKeysWritten = 1000;
+
+      Random rnd(301);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        // Compressible string
+        ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+      }
+
+      table_options.format_version = first_table_version == 1 ? 2 : 1;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      Reopen(options);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        auto r = Get(Key(i));
+        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+      }
+    }
+  }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_write_buffer_number = 16;
+
+  // Block background threads
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  std::vector<std::string> filenames;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  // In Windows, the LOCK file cannot be deleted because it is locked by
+  // db_test. After closing db_test, the LOCK file is unlocked and can be
+  // deleted. Delete archival files.
+  bool deleteDir = true;
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]);
+    if (!s.ok()) {
+      deleteDir = false;
+    }
+  }
+  if (deleteDir) {
+    ASSERT_OK(env_->DeleteDir(dbname_));
+  }
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to level 2
+  // After that, (100K, 200K)
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+  }
+
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  Close();
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Unblock background threads
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+  DBTest* db_test_;
+
+ public:
+  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    db_test_->env_->MockSleepForMicroseconds(1000 *
+                                             merge_in.operand_list.size());
+    merge_out->new_value = "";
+    return true;
+  }
+
+  const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  // Enable time profiling
+  SetPerfLevel(kEnableTime);
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", one));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+  ASSERT_OK(Flush());
+
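+  // The Get below merges the base value with the 2 operands, so
+  // DelayedMergeOperator sleeps 2 * 1000 simulated microseconds; the ticker
+  // assertion of 2000000 after the Get implies MERGE_OPERATION_TOTAL_TIME is
+  // reported in nanoseconds here (and the later full iteration doubles it).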
+  ReadOptions opt;
+  opt.verify_checksums = true;
+  opt.snapshot = nullptr;
+  std::string result;
+  ASSERT_OK(db_->Get(opt, "foo", &result));
+
+  ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+  ReadOptions read_options;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+
+  ASSERT_EQ(1, count);
+  ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+  SetPerfLevel(kEnableTime);
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = max_subcompactions_;
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  constexpr unsigned n = 1000;
+  for (unsigned i = 0; i < n; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(uint64_t{n} * 1000000U,
+            TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DelayFilterFactory>(this);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(kExceptTimeForMutex);
+  options.max_subcompactions = max_subcompactions_;
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  unsigned n = 0;
+  // Put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+      ++n;
+    }
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_OK(itr->status());
+  ASSERT_EQ(uint64_t{n} * 1000000U,
+            TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME));
+  delete itr;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;  // very small
+  // Only two memtables allowed ==> only two log files
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  for (int i = 0; i < 100000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+    // Only 2 memtables will be alive, so logs_to_free needs to always stay
+    // below 3.
+    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_TRUE(s.IsNotSupported());
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+   public:
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) override {
+      saved_context = context;
+      std::unique_ptr<CompactionFilter> empty_filter;
+      return empty_filter;
+    }
+    const char* Name() const override {
+      return "CompactionFilterFactoryGetContext";
+    }
+    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+                 compaction_filter_factory)
+          ->saved_context.is_manual_compaction;
+    }
+    CompactionFilter::Context saved_context;
+  };
+
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleLevel;
+  options.compaction_filter_factory.reset(
+      new CompactionFilterFactoryGetContext());
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_compaction_bytes = static_cast<uint64_t>(1) << 60;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // Make sure either L0 or L1 has a file.
+  while (NumTableFilesAtLevel(0) == 0 && NumTableFilesAtLevel(1) == 0) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  // Compact it three times.
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  // All files are compacted.
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Non-overlapping with the file on level 0.
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Should not compact the level 0 file.
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // SuggestCompactRange() is not going to be reported as a manual compaction.
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // Now it should compact the level 0 file; as it's a trivial move to L1,
+  // it triggers another compaction down to L2.
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, SuggestCompactRangeUniversal) {
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_compaction_bytes = static_cast<uint64_t>(1) << 60;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
ASSERT_EQ("1,2,3,4", FilesPerLevel()); + for (int i = 0; i < 3; i++) { + ASSERT_OK( + db_->SuggestCompactRange(db_->DefaultColumnFamily(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + // All files are compacted + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(0, NumTableFilesAtLevel(2)); + + GenerateNewRandomFile(&rnd); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // nonoverlapping with the file on level 0 + Slice start("a"), end("b"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // should not compact the level 0 file + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + start = Slice("j"); + end = Slice("m"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // now it should compact the level 0 file to the last level + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); +} + +TEST_F(DBTest, PromoteL0) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + DestroyAndReopen(options); + + // non overlapping ranges + std::vector> ranges = { + {81, 160}, {0, 80}, {161, 240}, {241, 320}}; + + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + for (const auto& range : ranges) { + for (int32_t j = range.first; j < range.second; j++) { + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + + // Promote L0 level to L2. + ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); + // We expect that all the files were trivially moved from L0 to L2 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); + + for (const auto& kv : values) { + ASSERT_EQ(Get(Key(kv.first)), kv.second); + } +} + +TEST_F(DBTest, PromoteL0Failure) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + DestroyAndReopen(options); + + // Produce two L0 files with overlapping ranges. + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(3), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Flush()); + + Status status; + // Fails because L0 has overlapping files. + status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + ASSERT_TRUE(status.IsInvalidArgument()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Now there is a file in L1. + ASSERT_GE(NumTableFilesAtLevel(1, 0), 1); + + ASSERT_OK(Put(Key(5), "")); + ASSERT_OK(Flush()); + // Fails because L1 is non-empty. 
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+  const int kNumLevels = 2;
+  const int kNumL0Files = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+  const int kNumL0Files = 50;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  // Never slow down / stop
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.max_background_compactions = 10;
+  DestroyAndReopen(options);
+
+  // Schedule automatic compactions after the manual one starts, but before
+  // it finishes, to ensure a conflict.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:Start",
+        "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+       {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+        "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+      [&](void* /*arg*/) { callback_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // Put two keys to ensure no trivial move.
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  port::Thread manual_compaction_thread([this]() {
+    CompactRangeOptions croptions;
+    croptions.exclusive_manual_compaction = true;
+    ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Put two keys to ensure no trivial move.
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+  ASSERT_GE(callback_count.load(), 1);
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_NE("NOT_FOUND", Get(Key(i)));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  manual_compaction_thread.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 36;
+  options.level0_stop_writes_trigger = 36;
+  DestroyAndReopen(options);
+
+  // Generate files for manual compaction.
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // Put two keys to ensure no trivial move.
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+  std::vector<std::string> input_files;
+  input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0",
+       "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+      {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+       "CompactFilesImpl:1"},
+  });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread manual_compaction_thread([&]() {
+    auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
+                               input_files, 0);
+    ASSERT_OK(s);
+  });
+
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+  // Generate enough files to trigger auto compaction.
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_GT(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+  manual_compaction_thread.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_LE(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+}
+#endif  // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int64_t j = 0;
+  for (int i = 0; i < 5; i++) {
+    for (int pass = 1; pass <= 3; pass++) {
+      WriteBatch batch;
+      size_t write_size = 1024 * 1024 * (5 + i);
+      fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+              (write_size / 1024 / 1024), pass);
+      for (;;) {
+        std::string data(3000, j++ % 127 + 20);
+        data += std::to_string(j);
+        ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data)));
+        if (batch.GetDataSize() > write_size) {
+          break;
+        }
+      }
+      fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+              (batch.GetDataSize() / 1024 / 1024));
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+      fprintf(stderr, "done\n");
+    }
+  }
+  // Make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that flushes can proceed in parallel with CompactRange().
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+  // iter == 0 -- leveled
+  // iter == 1 -- leveled, but throw in a flush between two levels compacting
+  // iter == 2 -- universal
+  for (int iter = 0; iter < 3; ++iter) {
+    Options options = CurrentOptions();
+    if (iter < 2) {
+      options.compaction_style = kCompactionStyleLevel;
+    } else {
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.write_buffer_size = 110 << 10;
+    options.level0_file_num_compaction_trigger = 4;
+    options.num_levels = 4;
+    options.compression = kNoCompression;
+    options.max_bytes_for_level_base = 450 << 10;
+    options.target_file_size_base = 98 << 10;
+    options.max_write_buffer_number = 2;
+
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int num = 0; num < 14; num++) {
+      GenerateNewRandomFile(&rnd);
+    }
+
+    if (iter == 1) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::RunManualCompaction()::1",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "DBImpl::RunManualCompaction()::2"}});
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"CompactionJob::Run():Start",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "CompactionJob::Run():End"}});
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<port::Thread> threads;
+    threads.emplace_back([&]() { Compact("a", "z"); });
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+    for (int num = 0; num < 3; num++) {
+      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    }
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+  const int kEntriesPerMemTable = 100;
+  const int kTotalFlushes = 12;
+
+  Options options = CurrentOptions();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.env = env_;
+  options.write_buffer_size = 100000000;
+  options.max_write_buffer_number = 256;
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kEntriesPerMemTable));
+
+  SetTimeElapseOnlySleepOnReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Block compactions.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(i), std::string(10000, 'x')));
+    ASSERT_OK(Flush());
+  }
+
+  // These writes will be slowed down to 1KB/s.
+  uint64_t estimated_sleep_time = 0;
+  Random rnd(301);
+  ASSERT_OK(Put("", ""));
+  uint64_t cur_rate = options.delayed_write_rate;
+  for (int i = 0; i < kTotalFlushes; i++) {
+    uint64_t size_memtable = 0;
+    for (int j = 0; j < kEntriesPerMemTable; j++) {
+      auto rand_num = rnd.Uniform(20);
+      // Spread the size range out more.
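+      // Cubing a value uniform in [0, 19] yields entry sizes from 0 up to
+      // 19^3 = 6859 bytes, skewed toward small entries.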
+      size_t entry_size = rand_num * rand_num * rand_num;
+      WriteOptions wo;
+      ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo));
+      size_memtable += entry_size + 18;
+      // Occasionally sleep a while.
+      if (rnd.Uniform(20) == 6) {
+        env_->SleepForMicroseconds(2666);
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // Slowed down twice: once for the memtable switch and once when the
+    // flush finishes.
+    cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+                                     kIncSlowdownRatio * kIncSlowdownRatio);
+  }
+  // Check that the total sleep time falls into a rough range around the
+  // estimate.
+  ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2);
+  ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.max_write_buffer_number = 256;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.hard_pending_compaction_bytes_limit = 800 << 10;
+  options.max_bytes_for_level_base = 10000000000u;
+  options.max_background_compactions = 1;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+        callback_count.fetch_add(1);
+        sleeping_task_low.WakeUp();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_EQ(0, callback_count.load());
+
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_GE(callback_count.load(), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+class WriteStallListener : public EventListener {
+ public:
+  WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+  void OnStallConditionsChanged(const WriteStallInfo& info) override {
+    MutexLock l(&mutex_);
+    condition_ = info.condition.cur;
+  }
+  bool CheckCondition(WriteStallCondition expected) {
+    MutexLock l(&mutex_);
+    return expected == condition_;
+  }
+
+ private:
+  port::Mutex mutex_;
+  WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.max_write_buffer_number = 256;
+  options.level0_file_num_compaction_trigger = 1;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000;  // About 20KB/s limited rate
+  options.soft_pending_compaction_bytes_limit = 160000;
+  options.target_file_size_base = 99999999;  // All into one file
+  options.max_bytes_for_level_base = 50000;
+  options.max_bytes_for_level_multiplier = 10;
+  options.max_background_compactions = 1;
+  options.compression = kNoCompression;
+  WriteStallListener* listener = new WriteStallListener();
+  options.listeners.emplace_back(listener);
+
+  // FlushMemTable with opt.wait=true does not wait for
+  // `OnStallConditionsChanged` to be called. The event listener is triggered
+  // on `JobContext::Clean`, which happens after the flush result is
+  // installed. We use a sync point to create a custom WaitForFlush that
+  // waits for context cleanup.
+  port::Mutex flush_mutex;
+  port::CondVar flush_cv(&flush_mutex);
+  bool flush_finished = false;
+  auto InstallFlushCallback = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      flush_finished = false;
+    }
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+          {
+            MutexLock l(&flush_mutex);
+            flush_finished = true;
+          }
+          flush_cv.SignalAll();
+        });
+  };
+  auto WaitForFlush = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      while (!flush_finished) {
+        flush_cv.Wait();
+      }
+    }
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  // Generate 360KB in Level 3.
+  for (int i = 0; i < 72; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    if (i % 10 == 0) {
+      ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+
+  // Generate 360KB in Level 2.
+  for (int i = 0; i < 72; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    if (i % 10 == 0) {
+      ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(2);
+
+  ASSERT_OK(Put(Key(0), ""));
+
+  test::SleepingBackgroundTask sleeping_task_low;
+  // Block compactions.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Create 3 L0 files, making the score of L0 become 3.
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  sleeping_task_low.Reset();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Now there is one L1 file, but it doesn't trigger the soft rate limit.
+  //
+  // TODO: soft_rate_limit is deprecated. If this test
+  // relies on soft_rate_limit, then we need to change the test.
+  //
+  // The L1 file size is around 30KB.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Only allow one compaction to go through.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+        // Schedule a sleeping task.
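+        // (Re-arming the sleeping task from inside this callback means each
+        // compaction that starts immediately blocks the single background
+        // thread again, so compactions are let through one at a time; the
+        // test wakes the task whenever it wants the next one to run.)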
+        sleeping_task_low.Reset();
+        env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                       &sleeping_task_low, Env::Priority::LOW);
+      });
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  // Create 3 L0 files, making the score of L0 become 3
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+
+  // Wake up the sleeping task so compaction can run, and wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+  // 10KB. Given level multiplier 10, the estimated pending compaction is
+  // around 100KB, which doesn't trigger soft_pending_compaction_bytes_limit.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Create 3 L0 files, making the score of L0 become 3, higher than the L1
+  // score.
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+  // Wake up the sleeping task so compaction can run, and wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+  // 40KB. L2 size is 360KB, so the estimated level fanout is 4 and the
+  // estimated pending compaction is around 200KB, triggering
+  // soft_pending_compaction_bytes_limit.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Shrink the level base so L2 will hit the soft limit more easily.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_bytes_for_level_base", "5000"},
+  }));
+
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WaitUntilSleeping();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  options.max_write_buffer_number = 4;
+  options.delayed_write_rate = 20000;
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  int kNumKeysPerMemtable = 3;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+
+  Reopen(options);
+  test::SleepingBackgroundTask sleeping_task;
+  // Block flushes
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  sleeping_task.WaitUntilSleeping();
+
+  // Fill three of the four allowed write buffers.
+  for (int i = 0; i < 3; i++) {
+    // Fill one mem table
+    for (int j = 0; j < kNumKeysPerMemtable; j++) {
+      ASSERT_OK(Put(Key(j), ""));
+    }
+    ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  }
+  // Inserting a new entry would create a new mem table, triggering slow down.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+}
+#endif  // !defined(ROCKSDB_LITE) &&
+        // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      // If the compression type is not supported, Open() should fail.
+      Options options = CurrentOptions();
+      options.compression = comp;
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Check that CreateColumnFamily fails as well.
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = comp;
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
+}
+
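+// The same guard can be applied in application code. A minimal sketch
+// (illustrative; kZSTD is just an example preference):
+//
+//   CompressionType preferred = kZSTD;
+//   Options opts;
+//   opts.compression =
+//       CompressionTypeSupported(preferred) ? preferred : kNoCompression;
+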
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+  Options options = CurrentOptions();
+  options.max_open_files = 100;
+  Reopen(options);
+
+  ColumnFamilyOptions cf_options(options);
+  // ttl is now supported even when max_open_files is not -1.
+  cf_options.ttl = 3600;
+  ColumnFamilyHandle* handle;
+  ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+  delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
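+// Row cache in a nutshell (illustrative sketch; the 8MB capacity is
+// arbitrary): unlike the block cache, it caches individual key/value
+// entries, so repeated point lookups of hot keys can skip the table reader
+// entirely.
+//
+//   Options opts;
+//   opts.row_cache = NewLRUCache(8 << 20);
+//   // Lookups served from it count as ROW_CACHE_HIT in statistics;
+//   // misses count as ROW_CACHE_MISS and populate the cache.
+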
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    PinnableSlice pin_slice;
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(pin_slice.ToString(), "bar");
+    // Entry is already in cache; the lookup removes the element from the LRU
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction the element is added back to the LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+}
+
+TEST_F(DBTest, ReusePinnableSlice) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    PinnableSlice pin_slice;
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(pin_slice.ToString(), "bar");
+
+    // Entry is already in cache; the lookup removes the element from the LRU
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction the element is added back to the LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    std::vector<Slice> multiget_keys;
+    multiget_keys.push_back("foo");
+    std::vector<PinnableSlice> multiget_values(1);
+    std::vector<Status> statuses({Status::NotFound()});
+    ReadOptions ropt;
+    dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+                       multiget_keys.size(), multiget_keys.data(),
+                       multiget_values.data(), statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+    dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+                       multiget_keys.size(), multiget_keys.data(),
+                       multiget_values.data(), statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+
+    // Entry is already in cache; the lookup removes the element from the LRU
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction the element is added back to the LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    std::vector<ColumnFamilyHandle*> multiget_cfs;
+    multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
+    std::vector<Slice> multiget_keys;
+    multiget_keys.push_back("foo");
+    std::vector<PinnableSlice> multiget_values(1);
+    std::vector<Status> statuses({Status::NotFound()});
+    ReadOptions ropt;
+    dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+                       multiget_keys.data(), multiget_values.data(),
+                       statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+    dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+                       multiget_keys.data(), multiget_values.data(),
+                       statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+
+    // Entry is already in cache; the lookup removes the element from the LRU
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction the element is added back to the LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+       {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Options options = CurrentOptions();
+  options.max_total_wal_size = 8192;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 1 << 20;
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // The oldest WAL should now be getting flushed.
+  ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+  // All flushes should now do nothing because their CF is dropped.
+  TEST_SYNC_POINT("Test:AllowFlushes");
+  TEST_SYNC_POINT("Test:WaitForFlush");
+  uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+  ASSERT_OK(Put(1, "key3", DummyString(8192)));
+  ASSERT_OK(Put(1, "key4", DummyString(8192)));
+  // A new WAL should have been created.
+  uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+  EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+  DestroyAndReopen(CurrentOptions());
+  env_->is_wal_sync_thread_safe_.store(false);
+  Status s = db_->SyncWAL();
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+                        ::testing::Combine(::testing::Values(1, 4),
+                                           ::testing::Bool()));
+
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;  // Small write buffer
+  Reopen(options);
+
+  std::vector<port::Thread> threads;
+  std::atomic<bool> done(false);
+  ASSERT_OK(db_->PauseBackgroundWork());
+  threads.emplace_back([&]() {
+    Random rnd(301);
+    for (int i = 0; i < 10000; ++i) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    done.store(true);
+  });
+  env_->SleepForMicroseconds(200000);
+  // Make sure the thread is not done.
+  ASSERT_FALSE(done.load());
+  ASSERT_OK(db_->ContinueBackgroundWork());
+  for (auto& t : threads) {
+    t.join();
+  }
+  // Now it should be done.
+  ASSERT_TRUE(done.load());
+}
+
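+// PauseBackgroundWork()/ContinueBackgroundWork() in a nutshell (illustrative
+// sketch): flushes and compactions are held while paused, so writes that
+// depend on them may stall until work is resumed.
+//
+//   ASSERT_OK(db->PauseBackgroundWork());     // background jobs halted
+//   // ... operate without racing against flush/compaction ...
+//   ASSERT_OK(db->ContinueBackgroundWork());  // resume
+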
+// Keep spawning short-lived threads that create an iterator and quit.
+// Meanwhile, in another thread, keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+  std::atomic<int> flushes_done{0};
+  std::atomic<int> threads_destroyed{0};
+  auto done = [&] { return flushes_done.load() > 10; };
+
+  port::Thread flushing_thread([&] {
+    for (int i = 0; !done(); ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+                         Slice(std::to_string(i).c_str())));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      int cnt = ++flushes_done;
+      fprintf(stderr, "Flushed %d times\n", cnt);
+    }
+  });
+
+  std::vector<port::Thread> thread_spawning_threads(10);
+  for (auto& t : thread_spawning_threads) {
+    t = port::Thread([&] {
+      while (!done()) {
+        {
+          port::Thread tmp_thread([&] {
+            auto it = db_->NewIterator(ReadOptions());
+            ASSERT_OK(it->status());
+            delete it;
+          });
+          tmp_thread.join();
+        }
+        ++threads_destroyed;
+      }
+    });
+  }
+
+  for (auto& t : thread_spawning_threads) {
+    t.join();
+  }
+  flushing_thread.join();
+  fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+          flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(0, "foo", "bar"));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  bool set_file_creation_time_to_zero = true;
+  int idx = 0;
+
+  int64_t time_1 = 0;
+  env_->GetCurrentTime(&time_1);
+  const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+  // Add 50 hours
+  env_->MockSleepForSeconds(50 * 60 * 60);
+
+  int64_t time_2 = 0;
+  env_->GetCurrentTime(&time_2);
+  const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          if (idx == 0) {
+            props->file_creation_time = 0;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_1;
+            idx = 0;
+          }
+        } else {
+          if (idx == 0) {
+            props->file_creation_time = uint_time_1;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_2;
+          }
+        }
+      });
+  // Set file creation time in manifest all to 0.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FileMetaData::FileMetaData", [&](void* arg) {
+        FileMetaData* meta = static_cast<FileMetaData*>(arg);
+        meta->file_creation_time = 0;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // At this point there should be 2 files, one with file_creation_time = 0
+  // and the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+  uint64_t creation_time;
+  Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+  ASSERT_EQ(0, creation_time);
+  ASSERT_EQ(s1, Status::OK());
+
+  // Testing with non-zero file creation time.
+  set_file_creation_time_to_zero = false;
+  options = CurrentOptions();
+  options.max_open_files = -1;
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // At this point there should be 2 files with non-zero file creation time.
+  // GetCreationTimeOfOldestFile API should return a non-zero value.
+  uint64_t ctime;
+  Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(uint_time_1, ctime);
+  ASSERT_EQ(s2, Status::OK());
+
+  // Testing with max_open_files != -1
+  options = CurrentOptions();
+  options.max_open_files = 10;
+  DestroyAndReopen(options);
+  Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(s3, Status::NotSupported());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
+  Options options = CurrentOptions();
+  options.max_write_buffer_size_to_maintain = 10000;
+  options.write_buffer_size = 160000;
+  Reopen(options);
+  Random rnd(301);
+  bool memory_limit_exceeded = false;
+
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  for (int i = 0; i < 1000; i++) {
+    std::string value = rnd.RandomString(1000);
+    ASSERT_OK(Put("keykey_" + std::to_string(i), value));
+
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+    const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
+    const uint64_t size_all_mem_table =
+        cur_active_mem + cfd->imm()->ApproximateMemoryUsage();
+
+    // Errors out if memory usage keeps increasing beyond the limit.
+    // Once the memory limit is exceeded, memory_limit_exceeded is set; if
+    // size_all_mem_table doesn't drop in the next write, the test errors out
+    // (not expected behaviour). If memory usage drops,
+    // memory_limit_exceeded is set back to false.
+    if ((size_all_mem_table > cur_active_mem) &&
+        (cur_active_mem >=
+         static_cast<uint64_t>(options.max_write_buffer_size_to_maintain)) &&
+        (size_all_mem_table >
+         static_cast<uint64_t>(options.max_write_buffer_size_to_maintain) +
+             options.write_buffer_size)) {
+      ASSERT_FALSE(memory_limit_exceeded);
+      memory_limit_exceeded = true;
+    } else {
+      memory_limit_exceeded = false;
+    }
+  }
+}
+
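+// max_write_buffer_size_to_maintain in a nutshell (illustrative sketch;
+// sizes are arbitrary): flushed immutable memtables are retained in memory
+// up to this budget, e.g. so transactions can run conflict checks against
+// recent history without hitting SST files.
+//
+//   Options opts;
+//   opts.write_buffer_size = 64 << 20;                   // active memtable
+//   opts.max_write_buffer_size_to_maintain = 128 << 20;  // retained history
+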
+TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Random rnd(403);
+
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put("key_" + std::to_string(i), rnd.RandomString(10)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(GetSstFileCount(dbname_), 20);
+
+  // We need !disable_auto_compactions for writes to stall, but we also want
+  // to delay compaction so stalled writes are unblocked by
+  // kShutdownInProgress. BG compaction will first wait for the sync point
+  // DBTest::ShuttingDownNotBlockStalledWrites. Then it waits an extra 2 sec
+  // to allow CancelAllBackgroundWork() to set shutting_down_.
+  SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void* /* arg */) { env_->SleepForMicroseconds(2 * 1000 * 1000); });
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::DelayWrite:Wait", "DBTest::ShuttingDownNotBlockStalledWrites"},
+       {"DBTest::ShuttingDownNotBlockStalledWrites",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.level0_stop_writes_trigger = 20;
+  options.disable_auto_compactions = false;
+  Reopen(options);
+
+  std::thread thd([&]() {
+    Status s = Put("key_" + std::to_string(101), "101");
+    ASSERT_EQ(s.code(), Status::kShutdownInProgress);
+  });
+
+  TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
+  CancelAllBackgroundWork(db_, true);
+
+  thd.join();
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..8adde3680
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,7652 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+  DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, OpenForReadOnly) {
+  DB* db_ptr = nullptr;
+  std::string dbname = test::PerThreadDBPath("db_readonly");
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // OpenForReadOnly should fail but will create <dbname> in the file system
+  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+  // Since <dbname> is created, we should be able to delete the dir
+  // We first get the list of files under <dbname>
+  // There should not be any subdirectories -- this is not checked here
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname, &files));
+  for (auto& f : files) {
+    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+  }
+  // <dbname> should be empty now and we should be able to delete it
+  ASSERT_OK(env_->DeleteDir(dbname));
+  options.create_if_missing = false;
+  // OpenForReadOnly should fail since <dbname> was successfully deleted
+  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+  // With create_if_missing false, there should not be a dir in the file
+  // system
+  ASSERT_NOK(env_->FileExists(dbname));
+}
+
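+// For contrast, a successful read-only open (illustrative sketch; the path
+// is hypothetical): the DB must already exist, and any write API through
+// the returned handle fails.
+//
+//   DB* rdb = nullptr;
+//   Options opts;  // create_if_missing left false
+//   Status s = DB::OpenForReadOnly(opts, "/path/to/existing/db", &rdb);
+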
+TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
+  DB* db_ptr = nullptr;
+  std::string dbname = test::PerThreadDBPath("db_readonly");
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  // OpenForReadOnly should fail but will create <dbname> in the file system
+  ASSERT_NOK(
+      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+  // Since <dbname> is created, we should be able to delete the dir
+  // We first get the list of files under <dbname>
+  // There should not be any subdirectories -- this is not checked here
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname, &files));
+  for (auto& f : files) {
+    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+  }
+  // <dbname> should be empty now and we should be able to delete it
+  ASSERT_OK(env_->DeleteDir(dbname));
+  options.create_if_missing = false;
+  // OpenForReadOnly should fail since <dbname> was successfully deleted
+  ASSERT_NOK(
+      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+  // With create_if_missing false, there should not be a dir in the file
+  // system
+  ASSERT_NOK(env_->FileExists(dbname));
+}
+
+class TestReadOnlyWithCompressedCache
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  TestReadOnlyWithCompressedCache()
+      : DBTestBase("test_readonly_with_compressed_cache",
+                   /*env_do_fsync=*/true) {
+    max_open_files_ = std::get<0>(GetParam());
+    use_mmap_ = std::get<1>(GetParam());
+  }
+  int max_open_files_;
+  bool use_mmap_;
+};
+
+TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
+  if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
+    ROCKSDB_GTEST_SKIP("Test requires MMAP support");
+    return;
+  }
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
+  ASSERT_OK(Flush());
+
+  DB* db_ptr = nullptr;
+  Options options = CurrentOptions();
+  options.allow_mmap_reads = use_mmap_;
+  options.max_open_files = max_open_files_;
+  options.compression = kSnappyCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.statistics = CreateDBStatistics();
+
+  ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));
+
+  std::string v;
+  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("bar", v);
+  ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("bar", v);
+  if (Snappy_Supported()) {
+    if (use_mmap_) {
+      ASSERT_EQ(0,
+                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+    } else {
+      ASSERT_EQ(1,
+                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+    }
+  }
+
+  delete db_ptr;
+}
+
+INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
+                        TestReadOnlyWithCompressedCache,
+                        ::testing::Combine(::testing::Values(-1, 100),
+                                           ::testing::Bool()));
+
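+// Two-level (partitioned) indexes, exercised below, split a table's index
+// into many small blocks behind a top-level index. A minimal illustrative
+// configuration sketch (the metadata block size is arbitrary):
+//
+//   BlockBasedTableOptions bbto;
+//   bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+//   bbto.metadata_block_size = 4096;  // target size of index partitions
+//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+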
+class PartitionedIndexTestListener : public EventListener {
+ public:
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    ASSERT_GT(info.table_properties.index_partitions, 1);
+    ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
+  }
+};
+
+TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
+  const int kValueSize = 10500;
+  const int kNumEntriesPerFile = 1000;
+  const int kNumFiles = 3;
+  const int kNumDistinctKeys = 30;
+
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+  PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.listeners.emplace_back(listener);
+  std::vector<const Snapshot*> snapshots;
+  Reopen(options);
+  Random rnd(301);
+
+  for (int i = 0; i < kNumFiles; i++) {
+    for (int j = 0; j < kNumEntriesPerFile; j++) {
+      int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
+      std::string value = rnd.RandomString(kValueSize);
+      ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
+      snapshots.push_back(db_->GetSnapshot());
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+class PrefixFullBloomWithReverseComparator
+    : public DBTestBase,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefixFullBloomWithReverseComparator()
+      : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
+  void SetUp() override { if_cache_filter_ = GetParam(); }
+  bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+       PrefixFullBloomWithReverseComparator) {
+  Options options = last_options_;
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  if (if_cache_filter_) {
+    bbto.no_block_cache = false;
+    bbto.cache_index_and_filter_blocks = true;
+    bbto.block_cache = NewLRUCache(1);
+  }
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+
+  if (bbto.block_cache) {
+    bbto.block_cache->EraseUnRefEntries();
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  iter->Seek("bar345");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar234", iter->key().ToString());
+  ASSERT_EQ("foo2", iter->value().ToString());
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar123", iter->key().ToString());
+  ASSERT_EQ("foo", iter->value().ToString());
+
+  iter->Seek("foo234");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo123", iter->key().ToString());
+  ASSERT_EQ("foo3", iter->value().ToString());
+
+  iter->Seek("bar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+                        PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+  ASSERT_OK(Put("", ""));
+  Iterator* iter1 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter1->status());
+  std::string prop_value;
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+
+  Iterator* iter2 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter2->status());
+  ASSERT_OK(
+      iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number2 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_GT(version_number2, version_number1);
+
+  ASSERT_OK(Put("", ""));
+
+  Iterator* iter3 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter3->status());
+  ASSERT_OK(
+      iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number3 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_EQ(version_number2, version_number3);
+
+  iter1->SeekToFirst();
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1_new =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+  ASSERT_EQ(version_number1, version_number1_new);
+
+  delete iter1;
+  delete iter2;
+  delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "a", "begin"));
+  ASSERT_OK(Put(1, "z", "end"));
+  ASSERT_OK(Flush(1));
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  std::string value;
+  value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_successive_merges = 3;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("poi", "Finch"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
+  options.max_successive_merges = 2;
+  Reopen(options);
+}
+
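+// WriteBufferManager, exercised by the tests below, in a nutshell
+// (illustrative sketch; sizes are arbitrary): one manager caps total
+// memtable memory across all column families, and across DBs if they share
+// the same manager; passing a block cache additionally charges that memory
+// against the cache's capacity.
+//
+//   std::shared_ptr<Cache> cache = NewLRUCache(1 << 30);
+//   Options opts;
+//   opts.write_buffer_manager.reset(new WriteBufferManager(256 << 20, cache));
+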
+#ifndef ROCKSDB_LITE
+class DBTestSharedWriteBufferAcrossCFs
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  DBTestSharedWriteBufferAcrossCFs()
+      : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
+  void SetUp() override {
+    use_old_interface_ = std::get<0>(GetParam());
+    cost_cache_ = std::get<1>(GetParam());
+  }
+  bool use_old_interface_;
+  bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  options.listeners.push_back(flush_listener);
+  // Don't trip the listener at shutdown.
+  options.avoid_flush_during_shutdown = true;
+
+  // Avoid nondeterministic results from malloc_usable_size();
+  // force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The total soft write buffer size is about 105000
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+  if (use_old_interface_) {
+    options.db_write_buffer_size = 120000;  // this is the real limit
+  } else if (!cost_cache_) {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285));
+  } else {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+  }
+  options.write_buffer_size = 500000;  // this is never hit
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+    // Ensure background work is fully finished, including listener
+    // callbacks, before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  };
+
+  // Create some data and flush "default" and "nikitich" so that they
+  // become the most recently flushed CFs.
+  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+            static_cast<uint64_t>(1));
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+            static_cast<uint64_t>(1));
+
+  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  // No flush should trigger
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+
+  // Trigger a flush. Flushing "nikitich".
+  ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Without hitting the threshold, no flush should trigger.
+  ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Hit the write buffer limit again. "default"
+  // will have been flushed.
+  ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+  wait_flush();
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Trigger another flush. This time "dobrynia". "pikachu" should not
+  // be flushed, although it was never flushed before.
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+  wait_flush();
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    Close();
+    options.write_buffer_manager.reset();
+    last_options_.write_buffer_manager.reset();
+    ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+                        DBTestSharedWriteBufferAcrossCFs,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, false),
+                                          std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  options.listeners.push_back(flush_listener);
+  // Don't trip the listener at shutdown.
+  options.avoid_flush_during_shutdown = true;
+  // Avoid nondeterministic results from malloc_usable_size();
+  // force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  options.write_buffer_size = 500000;  // this is never hit
+  // Use a write buffer total size so that the soft limit is about
+  // 105000.
+  options.write_buffer_manager.reset(new WriteBufferManager(120000));
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  ASSERT_OK(DestroyDB(dbname2, options));
+  DB* db2 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+    // Ensure background work is fully finished, including listener
+    // callbacks, before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+  };
+
+  // Trigger a flush on cf2
+  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+  wait_flush();
+
+  // Insert to DB2
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+  wait_flush();
+
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Trigger a flush on another CF in DB1.
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Trigger a flush in DB2.
+  ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+  wait_flush();
+  ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+  wait_flush();
+  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(1));
+  }
+
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  std::shared_ptr<Cache> cache = NewLRUCache(LRUCacheOptions(
+      10000000 /* capacity */, 1 /* num_shard_bits */,
+      false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */,
+      nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+      kDontChargeCacheMetadata));
+
+  options.write_buffer_size = 50000;  // this is never hit
+  // A total size of 0 means no write buffer limit, but memtable memory is
+  // still charged against the cache.
+  options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  // One dummy entry is 256KB.
+  ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
+void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+                          const std::vector<Slice>& keys_must_not_exist) {
+  // Ensure that expected keys exist
+  std::vector<std::string> values;
+  if (keys_must_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_exist, &values);
+    for (size_t i = 0; i < keys_must_exist.size(); i++) {
+      ASSERT_OK(status_list[i]);
+    }
+  }
+
+  // Ensure that given keys don't exist
+  if (keys_must_not_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+    for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+      ASSERT_TRUE(status_list[i].IsNotFound());
+    }
+  }
+}
+
+}  // anonymous namespace
+
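+// WalFilter, exercised by the tests below, in a nutshell (illustrative
+// sketch; MyWalFilter is a hypothetical name): during recovery each WAL
+// record is offered to the filter, which can keep it, drop it, rewrite it,
+// stop replay, or mark it corrupted.
+//
+//   class MyWalFilter : public WalFilter {
+//    public:
+//     WalProcessingOption LogRecord(const WriteBatch& batch,
+//                                   WriteBatch* new_batch,
+//                                   bool* batch_changed) const override {
+//       return WalProcessingOption::kContinueProcessing;
+//     }
+//     const char* Name() const override { return "MyWalFilter"; }
+//   };
+//
+//   options.wal_filter = &my_filter;  // consulted on the next Open()
+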
+TEST_F(DBTest2, WalFilterTest) {
+  class TestWalFilter : public WalFilter {
+   private:
+    // Processing option that is requested to be applied at the given index
+    WalFilter::WalProcessingOption wal_processing_option_;
+    // Index at which to apply wal_processing_option_
+    // At other indexes the default WalProcessingOption::kContinueProcessing
+    // is returned.
+    size_t apply_option_at_record_index_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+                  size_t apply_option_for_record_index)
+        : wal_processing_option_(wal_processing_option),
+          apply_option_at_record_index_(apply_option_for_record_index),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                  WriteBatch* /*new_batch*/,
+                                  bool* /*batch_changed*/) const override {
+      WalFilter::WalProcessingOption option_to_return;
+
+      if (current_record_index_ == apply_option_at_record_index_) {
+        option_to_return = wal_processing_option_;
+      } else {
+        option_to_return = WalProcessingOption::kContinueProcessing;
+      }
+
+      // The filter is passed as a const object so that RocksDB does not
+      // modify it; however, we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+      return option_to_return;
+    }
+
+    const char* Name() const override { return "TestWalFilter"; }
+  };
+
+  // Create 3 batches with two keys each
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  // Test with all WAL processing options
+  for (int option = 0;
+       option < static_cast<int>(
+                    WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+       option++) {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Write given keys in given batches
+    for (size_t i = 0; i < batch_keys.size(); i++) {
+      WriteBatch batch;
+      for (size_t j = 0; j < batch_keys[i].size(); j++) {
+        ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+      }
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+    }
+
+    WalFilter::WalProcessingOption wal_processing_option =
+        static_cast<WalFilter::WalProcessingOption>(option);
+
+    // Create a test filter that applies wal_processing_option at record
+    // index 1
+    size_t apply_option_for_record_index = 1;
+    TestWalFilter test_wal_filter(wal_processing_option,
+                                  apply_option_for_record_index);
+
+    // Reopen database with option to use WAL filter
+    options = OptionsForLogIterTest();
+    options.wal_filter = &test_wal_filter;
+    Status status =
+        TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+    if (wal_processing_option ==
+        WalFilter::WalProcessingOption::kCorruptedRecord) {
+      ASSERT_NOK(status);
+      // In case of corruption we can turn off paranoid_checks to reopen
+      // the database
+      options.paranoid_checks = false;
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    } else {
+      ASSERT_OK(status);
+    }
+
+    // Compute which keys we expect to be found
+    // and which we expect not to be found after recovery.
+    std::vector<Slice> keys_must_exist;
+    std::vector<Slice> keys_must_not_exist;
+    switch (wal_processing_option) {
+      case WalFilter::WalProcessingOption::kCorruptedRecord:
+      case WalFilter::WalProcessingOption::kContinueProcessing: {
+        fprintf(stderr, "Testing with complete WAL processing\n");
+        // We expect all records to be processed.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            keys_must_exist.push_back(Slice(batch_keys[i][j]));
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+        fprintf(stderr,
+                "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+                apply_option_for_record_index);
+        // We expect the record with apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i == apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            } else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kStopReplay: {
+        fprintf(stderr,
+                "Testing with stopping replay from record %" ROCKSDB_PRIszt
+                "\n",
+                apply_option_for_record_index);
+        // We expect records beyond apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i >= apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            } else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      default:
+        FAIL();  // unhandled case
+    }
+
+    bool checked_after_reopen = false;
+
+    while (true) {
+      // Ensure that expected keys exist
+      // and unexpected keys don't exist after recovery.
+      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+      if (checked_after_reopen) {
+        break;
+      }
+
+      // Reopen the database again to make sure previous log(s) are not used
+      // (even if they were skipped).
+      // Reopen the database with the option to use the WAL filter.
+      options = OptionsForLogIterTest();
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+      checked_after_reopen = true;
+    }
+  }
+}
+
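+// Rewriting a recovered record in a nutshell (illustrative sketch;
+// CopySomeKeysHandler is a hypothetical WriteBatch::Handler): a filter
+// rebuilds new_batch while iterating the original batch, then reports the
+// change -- exactly the pattern the next test exercises.
+//
+//   WalProcessingOption LogRecord(const WriteBatch& batch,
+//                                 WriteBatch* new_batch,
+//                                 bool* batch_changed) const override {
+//     CopySomeKeysHandler handler(new_batch);
+//     if (batch.Iterate(&handler).ok()) {
+//       *batch_changed = true;
+//     }
+//     return WalProcessingOption::kContinueProcessing;
+//   }
+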
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+  class ChangeBatchHandler : public WriteBatch::Handler {
+   private:
+    // Batch to insert keys in
+    WriteBatch* new_write_batch_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Number of keys added to new batch
+    size_t num_keys_added_;
+
+   public:
+    ChangeBatchHandler(WriteBatch* new_write_batch,
+                       size_t num_keys_to_add_in_new_batch)
+        : new_write_batch_(new_write_batch),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          num_keys_added_(0) {}
+    void Put(const Slice& key, const Slice& value) override {
+      if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+        ASSERT_OK(new_write_batch_->Put(key, value));
+        ++num_keys_added_;
+      }
+    }
+  };
+
+  class TestWalFilterWithChangeBatch : public WalFilter {
+   private:
+    // Index at which to start changing records
+    size_t change_records_from_index_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilterWithChangeBatch(size_t change_records_from_index,
+                                 size_t num_keys_to_add_in_new_batch)
+        : change_records_from_index_(change_records_from_index),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      if (current_record_index_ >= change_records_from_index_) {
+        ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+        Status s = batch.Iterate(&handler);
+        if (s.ok()) {
+          *batch_changed = true;
+        } else {
+          assert(false);
+        }
+      }
+
+      // The filter is passed as a const object so that RocksDB does not
+      // modify it; however, we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilterWithChangeBatch*>(this)
+           ->current_record_index_)++;
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Create a test filter that changes records starting from the given index
+  size_t change_records_from_index = 1;
+  size_t num_keys_to_add_in_new_batch = 1;
+  TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+      change_records_from_index, num_keys_to_add_in_new_batch);
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_with_change_batch;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Ensure that all keys exist before change_records_from_index_,
+  // and after that index only a single key exists per batch,
+  // as our filter adds only a single key for each batch.
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+        keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+      } else {
+        keys_must_exist.push_back(Slice(batch_keys[i][j]));
+      }
+    }
+  }
+
+  bool checked_after_reopen = false;
+
+  while (true) {
+    // Ensure that expected keys exist
+    // and unexpected keys don't exist after recovery.
+    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+    if (checked_after_reopen) {
+      break;
+    }
+
+    // Reopen the database again to make sure previous log(s) are not used
+    // (even if they were skipped).
+    // Reopen the database with the option to use the WAL filter.
+    options = OptionsForLogIterTest();
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    checked_after_reopen = true;
+  }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+  class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+   public:
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      *new_batch = batch;
+      Status s = new_batch->Put("key_extra", "value_extra");
+      if (s.ok()) {
+        *batch_changed = true;
+      } else {
+        assert(false);
+      }
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithChangeBatchExtraKeys";
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Create a test filter that would add extra keys
+  TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_extra_keys;
+  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(status.IsNotSupported());
+
+  // Reopen without the filter; now the reopen should succeed -- the previous
+  // attempt to open must not have altered the db.
+  options = OptionsForLogIterTest();
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;  // empty vector
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      keys_must_exist.push_back(Slice(batch_keys[i][j]));
+    }
+  }
+
+  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+  class TestWalFilterWithColumnFamilies : public WalFilter {
+   private:
+    // column_family_id -> log_number map (provided to WALFilter)
+    std::map<uint32_t, uint64_t> cf_log_number_map_;
+    // column_family_name -> column_family_id map (provided to WALFilter)
+    std::map<std::string, uint32_t> cf_name_id_map_;
+    // column_family_id -> keys_found_in_wal map
+    // We store keys that are applicable to the column_family
+    // during recovery (i.e. aren't already flushed to SST file(s))
+    // for verification against the keys we expect.
+    std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+
+   public:
+    void ColumnFamilyLogNumberMap(
+        const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+        const std::map<std::string, uint32_t>& cf_name_id_map) override {
+      cf_log_number_map_ = cf_lognumber_map;
+      cf_name_id_map_ = cf_name_id_map;
+    }
+
+    WalProcessingOption LogRecordFound(unsigned long long log_number,
+                                       const std::string& /*log_file_name*/,
+                                       const WriteBatch& batch,
+                                       WriteBatch* /*new_batch*/,
+                                       bool* /*batch_changed*/) override {
+      class LogRecordBatchHandler : public WriteBatch::Handler {
+       private:
+        const std::map<uint32_t, uint64_t>& cf_log_number_map_;
+        std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
+        unsigned long long log_number_;
+
+       public:
+        LogRecordBatchHandler(
+            unsigned long long current_log_number,
+            const std::map<uint32_t, uint64_t>& cf_log_number_map,
+            std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
+            : cf_log_number_map_(cf_log_number_map),
+              cf_wal_keys_(cf_wal_keys),
+              log_number_(current_log_number) {}
+
+        Status PutCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& /*value*/) override {
+          auto it = cf_log_number_map_.find(column_family_id);
+          assert(it != cf_log_number_map_.end());
+          unsigned long long log_number_for_cf = it->second;
+          // If the current record is applicable for column_family_id
+          // (i.e. isn't flushed to SST file(s) for column_family_id)
+          // add it to the cf_wal_keys_ map for verification.
+          if (log_number_ >= log_number_for_cf) {
+            cf_wal_keys_[column_family_id].push_back(
+                std::string(key.data(), key.size()));
+          }
+          return Status::OK();
+        }
+      } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+      Status s = batch.Iterate(&handler);
+      if (!s.ok()) {
+        // TODO(AR) is this ok?
+        return WalProcessingOption::kCorruptedRecord;
+      }
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithColumnFamilies";
+    }
+
+    const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+      return cf_wal_keys_;
+    }
+
+    const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
+      return cf_name_id_map_;
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+  batch_keys_pre_flush[0].push_back("key1");
+  batch_keys_pre_flush[0].push_back("key2");
+  batch_keys_pre_flush[1].push_back("key3");
+  batch_keys_pre_flush[1].push_back("key4");
+  batch_keys_pre_flush[2].push_back("key5");
+  batch_keys_pre_flush[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
+                          DummyString(1024)));
+      ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
+                          DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Flush default column-family
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));
+
+  // Do some more writes
+  std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+  batch_keys_post_flush[0].push_back("key7");
+  batch_keys_post_flush[0].push_back("key8");
+  batch_keys_post_flush[1].push_back("key9");
+  batch_keys_post_flush[1].push_back("key10");
+  batch_keys_post_flush[2].push_back("key11");
+  batch_keys_post_flush[2].push_back("key12");
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
+                          DummyString(1024)));
+      ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
+                          DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // On Recovery we should only find the second batch applicable to default CF
+  // But both batches applicable to pikachu CF
+
+  // Create a test filter that records the keys it finds in the WAL
+  TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_column_families;
+  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(status.ok());
+
+  // verify that handles_[0] only has post_flush keys
+  // while handles_[1] has pre and post flush keys
+  auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+  auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+  size_t index = 0;
+  auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+  // default column-family, only post_flush keys are expected
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+  ASSERT_EQ(index, keys_cf.size());
+
+  index = 0;
+  keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+  // pikachu column-family, all keys are expected
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_pre_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+  ASSERT_EQ(index, keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+  // Verifies that compression ratio improves when dictionary is enabled, and
+  // improves even further when the dictionary is trained by ZSTD.
+  const size_t kBlockSizeBytes = 4 << 10;
+  const size_t kL0FileBytes = 128 << 10;
+  const size_t kApproxPerBlockOverheadBytes = 50;
+  const int kNumL0Files = 5;
+
+  Options options;
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.arena_block_size = kBlockSizeBytes;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
+  options.num_levels = 2;
+  options.target_file_size_base = kL0FileBytes;
+  options.target_file_size_multiplier = 2;
+  options.write_buffer_size = kL0FileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = kBlockSizeBytes;
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  compression_types.push_back(kLZ4Compression);
+  compression_types.push_back(kLZ4HCCompression);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  }
+
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDfinalizeDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
+  for (auto compression_type : compression_types) {
+    options.compression = compression_type;
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_finalize_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also
+      // in the non-first iterations, verify the data we get out is the same
+      // data we put in.
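+      // Rough meaning of the knobs toggled below (summarized from the
+      // CompressionOptions documentation, so treat as informational):
+      // max_dict_bytes caps the size of the dictionary itself,
+      // zstd_max_train_bytes caps how much sampled input is handed to ZSTD's
+      // trainer, and use_zstd_dict_trainer selects ZDICT_trainFromBuffer()
+      // over ZDICT_finalizeDictionary() when the latter is available.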
+      switch (i) {
+        case kWithoutDict:
+          options.compression_opts.max_dict_bytes = 0;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithZSTDfinalizeDict:
+          if (compression_type != kZSTD ||
+              !ZSTD_FinalizeDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = false;
+          break;
+        case kWithZSTDTrainedDict:
+          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = true;
+          break;
+        default:
+          assert(false);
+      }
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      CreateAndReopenWithCF({"pikachu"}, options);
+      Random rnd(301);
+      std::string seq_datas[10];
+      for (int j = 0; j < 10; ++j) {
+        seq_datas[j] =
+            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+      }
+
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      for (int j = 0; j < kNumL0Files; ++j) {
+        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+                        seq_datas[(key_num / 10) % 10]));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+      }
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDfinalizeDict) {
+        bytes_with_zstd_finalize_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
+      }
+
+      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+           j++) {
+        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+      }
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDfinalizeDict) {
+        // In zstd compression, it is sometimes possible that using a finalized
+        // dictionary does not get as good a compression ratio as a raw content
+        // dictionary. But using a dictionary should always get a better
+        // compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+                    bytes_without_dict > bytes_with_zstd_finalize_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // a better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
+      }
+
+      DestroyAndReopen(options);
+    }
+  }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  // Verifies that compression dictionary is generated from local data. The
+  // verification simply checks all output SSTs have different compression
+  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+  // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1K entries per file
+  const int kNumBytesPerEntry = 1 << 10;   // 1KB per entry
+  const int kNumFiles = 4;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+                    rnd.RandomString(kNumBytesPerEntry)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+  }
+
+  // Store all the dictionaries generated during a full compaction.
+  std::vector<std::string> compression_dicts;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* arg) {
+        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  CompactRangeOptions compact_range_opts;
+  compact_range_opts.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+  // Dictionary compression should not be so good as to compress four totally
+  // random files into one. If it does then there's probably something wrong
+  // with the test.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Furthermore, there should be one compression dictionary generated per
+  // file, and they should all be different from each other.
+  ASSERT_EQ(NumTableFilesAtLevel(1),
+            static_cast<int>(compression_dicts.size()));
+  for (size_t i = 1; i < compression_dicts.size(); ++i) {
+    std::string& a = compression_dicts[i - 1];
+    std::string& b = compression_dicts[i];
+    size_t alen = a.size();
+    size_t blen = b.size();
+    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+  }
+}
+
+class PresetCompressionDictTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+  PresetCompressionDictTest()
+      : DBTestBase("db_test2", false /* env_do_fsync */),
+        compression_type_(std::get<0>(GetParam())),
+        bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+  const CompressionType compression_type_;
+  const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTest2, PresetCompressionDictTest,
+    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+                       ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+  // Verifies that dictionary is generated and written during flush only when
+  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+  // size of the dictionary is within expectations according to the limit on
+  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t i = 0; i <= kKeysPerFile; ++i) {
+    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the flush finishes.
+  if (bottommost_) {
+    // Flush is never considered bottommost. This should change in the future
+    // since flushed files may have nothing underneath them, like the one in
+    // this test case.
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation
+    // on number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD &&
+        compression_type_ != kZSTDNotFinalCompression) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+  // Verifies that dictionary is generated and written during compaction to a
+  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+  // dictionary. Also verifies the size of the dictionary is within
+  // expectations according to the limit on buffering set by
+  // `CompressionOptions::max_dict_buffer_bytes`.
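+  // The bottommost_ == true configuration below mirrors a common production
+  // shape (a sketch, not taken verbatim from the RocksDB docs) where the
+  // dictionary is enabled only for the coldest data:
+  //   options.bottommost_compression = kZSTD;
+  //   options.bottommost_compression_opts.enabled = true;
+  //   options.bottommost_compression_opts.max_dict_bytes = 16 << 10;
+  // while non-bottommost levels keep their plain per-level compression.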
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t j = 0; j <= kKeysPerFile; ++j) {
+    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+  // file is not bottommost due to the existing L2 file covering the same key
+  // range.
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the compaction finishes.
+  if (bottommost_) {
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation
+    // on number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD &&
+        compression_type_ != kZSTDNotFinalCompression) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // the bottommost level only when either `ColumnFamilyOptions::compression`
+  // or `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+  // verifies the size of the dictionary is within expectations according to
+  // the limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
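+  // Unlike the non-bottommost case above, the ticker check after the manual
+  // CompactRange() below is unconditional: the single output file lands on
+  // the bottommost level, so a dictionary is expected whether it was
+  // configured through compression_opts or bottommost_compression_opts.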
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+  ASSERT_GT(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      prev_compression_dict_bytes_inserted);
+  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
+  // digested dictionary format.
+  if (compression_type_ != kZSTD &&
+      compression_type_ != kZSTDNotFinalCompression) {
+    // Although we limited buffering to `kBlockLen`, there may be up to two
+    // blocks of data included in the dictionary since we only check limit
+    // after each block is built.
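+    // Worked bound: kBlockLen = 4 << 10 = 4096 bytes, so "up to two blocks"
+    // caps the check below at prev_compression_dict_bytes_inserted +
+    // 2 * 4096 = prev + 8192 bytes of dictionary data in the block cache.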
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+  }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+  explicit CompactionCompressionListener(Options* db_options)
+      : db_options_(db_options) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    // Figure out last level with files
+    int bottommost_level = 0;
+    for (int level = 0; level < db->NumberLevels(); level++) {
+      std::string files_at_level;
+      ASSERT_TRUE(
+          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+                          &files_at_level));
+      if (files_at_level != "0") {
+        bottommost_level = level;
+      }
+    }
+
+    if (db_options_->bottommost_compression != kDisableCompressionOption &&
+        ci.output_level == bottommost_level) {
+      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+    } else if (db_options_->compression_per_level.size() != 0) {
+      ASSERT_EQ(ci.compression,
+                db_options_->compression_per_level[ci.output_level]);
+    } else {
+      ASSERT_EQ(ci.compression, db_options_->compression);
+    }
+    max_level_checked = std::max(max_level_checked, ci.output_level);
+  }
+
+  int max_level_checked = 0;
+  const Options* db_options_;
+};
+
+enum CompressionFailureType {
+  kTestCompressionFail,
+  kTestDecompressionFail,
+  kTestDecompressionCorruption
+};
+
+class CompressionFailuresTest
+    : public DBTest2,
+      public testing::WithParamInterface<std::tuple<
+          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+  CompressionFailuresTest() {
+    std::tie(compression_failure_type_, compression_type_,
+             compression_max_dict_bytes_, compression_parallel_threads_) =
+        GetParam();
+  }
+
+  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+  CompressionType compression_type_ = kNoCompression;
+  uint32_t compression_max_dict_bytes_ = 0;
+  uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTest2, CompressionFailuresTest,
+    ::testing::Combine(::testing::Values(kTestCompressionFail,
+                                         kTestDecompressionFail,
+                                         kTestDecompressionCorruption),
+                       ::testing::ValuesIn(GetSupportedCompressions()),
+                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+  if (compression_type_ == kNoCompression) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 512;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 512;
+  table_options.verify_compression = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  options.compression = compression_type_;
+  options.compression_opts.parallel_threads = compression_parallel_threads_;
+  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+  options.bottommost_compression_opts.parallel_threads =
+      compression_parallel_threads_;
+  options.bottommost_compression_opts.max_dict_bytes =
+      compression_max_dict_bytes_;
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "CompressData:TamperWithReturnValue", [](void* arg) {
+          bool* ret = static_cast<bool*>(arg);
+          *ret = false;
+        });
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
+          Status* ret = static_cast<Status*>(arg);
+          ASSERT_OK(*ret);
+          *ret = Status::Corruption("kTestDecompressionFail");
+        });
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UncompressBlockData:"
+        "TamperWithDecompressionOutput",
+        [](void* arg) {
+          BlockContents* contents = static_cast<BlockContents*>(arg);
+          // Ensure uncompressed data != original data
+          const size_t len = contents->data.size() + 1;
+          std::unique_ptr<char[]> fake_data(new char[len]());
+          *contents = BlockContents(std::move(fake_data), len);
+        });
+  }
+
+  std::map<std::string, std::string> key_value_written;
+
+  const int kKeySize = 5;
+  const int kValUnitSize = 16;
+  const int kValSize = 256;
+  Random rnd(405);
+
+  Status s = Status::OK();
+
+  DestroyAndReopen(options);
+  // Write 10 random files
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 5; j++) {
+      std::string key = rnd.RandomString(kKeySize);
+      // Ensure good compression ratio
+      std::string valueUnit = rnd.RandomString(kValUnitSize);
+      std::string value;
+      for (int k = 0; k < kValSize; k += kValUnitSize) {
+        value += valueUnit;
+      }
+      s = Put(key, value);
+      if (compression_failure_type_ == kTestCompressionFail) {
+        key_value_written[key] = value;
+        ASSERT_OK(s);
+      }
+    }
+    s = Flush();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    if (i == 4) {
+      // Make compression fail in the middle of table building
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    // Should be kNoCompression, check content consistency
+    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      std::string key = db_iter->key().ToString();
+      std::string value = db_iter->value().ToString();
+      ASSERT_NE(key_value_written.find(key), key_value_written.end());
+      ASSERT_EQ(key_value_written[key], value);
+      key_value_written.erase(key);
+    }
+    ASSERT_EQ(0, key_value_written.size());
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Could not decompress: kTestDecompressionFail");
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Decompressed block did not match pre-compression block");
+  }
+}
+
+TEST_F(DBTest2, CompressionOptions) {
+  if (!Zlib_Supported() || !Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 100;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+
+  CompactionCompressionListener* listener =
+      new CompactionCompressionListener(&options);
+  options.listeners.emplace_back(listener);
+
+  const int kKeySize = 5;
+  const int kValSize = 20;
+  Random rnd(301);
+
+  std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+  std::map<std::string, std::string> key_value_written;
+
+  for (int iter = 0; iter <= 2; iter++) {
+    listener->max_level_checked = 0;
+
+    if (iter == 0) {
+      // Use different compression algorithms for different levels but
+      // always use Zlib for bottommost level
+      options.compression_per_level = {kNoCompression,     kNoCompression,
+                                       kNoCompression,     kSnappyCompression,
+                                       kSnappyCompression, kSnappyCompression,
+                                       kZlibCompression};
+      options.compression = kNoCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 1) {
+      // Use Snappy everywhere except the bottommost level, which uses Zlib
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 2) {
+      // Use Snappy everywhere
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kDisableCompressionOption;
+    }
+
+    for (auto num_threads : compression_parallel_threads) {
+      options.compression_opts.parallel_threads = num_threads;
+      options.bottommost_compression_opts.parallel_threads = num_threads;
+
+      DestroyAndReopen(options);
+      // Write 10 random files
+      for (int i = 0; i < 10; i++) {
+        for (int j = 0; j < 5; j++) {
+          std::string key = rnd.RandomString(kKeySize);
+          std::string value = rnd.RandomString(kValSize);
+          key_value_written[key] = value;
+          ASSERT_OK(Put(key, value));
+        }
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      }
+
+      // Make sure that we wrote enough to check all 7 levels
+      ASSERT_EQ(listener->max_level_checked, 6);
+
+      // Make sure database content is the same as key_value_written
+      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+        std::string key = db_iter->key().ToString();
+        std::string value = db_iter->value().ToString();
+        ASSERT_NE(key_value_written.find(key), key_value_written.end());
+        ASSERT_EQ(key_value_written[key], value);
+        key_value_written.erase(key);
+      }
+      ASSERT_OK(db_iter->status());
+      ASSERT_EQ(0, key_value_written.size());
+    }
+  }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+  CompactionStallTestListener()
+      : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacting_files_cnt_ += ci.input_files.size();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacted_files_cnt_ += ci.input_files.size();
+  }
+
+  std::atomic<size_t> compacting_files_cnt_;
+  std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+       {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+       {"DBTest2::CompactionStall:2",
+        "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+       {"DBTest2::CompactionStall:3",
+        "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_background_compactions = 40;
+  CompactionStallTestListener* listener = new CompactionStallTestListener();
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  Random rnd(301);
+
+  // 4 Files in L0
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+  // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
+  // at DBTest2::CompactionStall:1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Another 6 L0 files to trigger compaction again
+  for (int i = 0; i < 6; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for another compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+  // Hold NotifyOnCompactionBegin in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+  // Hold NotifyOnCompactionCompleted in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LT(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+  ASSERT_GT(listener->compacted_files_cnt_.load(),
+            10 - options.level0_file_num_compaction_trigger);
+  ASSERT_EQ(listener->compacting_files_cnt_.load(),
+            listener->compacted_files_cnt_.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+  Options options;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // This snapshot will have sequence number 0, which is the expected
+  // behaviour.
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
+
+  db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+  Options options;
+  options = CurrentOptions(options);
+  std::vector<const Snapshot*> snapshots;
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  SequenceNumber oldest_ww_snap, first_ww_snap;
+
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  first_ww_snap = snapshots.back()->GetSequenceNumber();
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+
+  {
+    InstrumentedMutexLock l(dbi->mutex());
+    auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+    ASSERT_EQ(seqs.size(), 4);  // duplicates are not counted
+    ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+  }
+
+  for (auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  PinL0IndexAndFilterBlocksTest()
+      : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
+  void SetUp() override {
+    infinite_max_files_ = std::get<0>(GetParam());
+    disallow_preload_ = std::get<1>(GetParam());
+  }
+
+  void CreateTwoLevels(Options* options, bool close_afterwards) {
+    if (infinite_max_files_) {
+      options->max_open_files = -1;
+    }
+    options->create_if_missing = true;
+    options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+    CreateAndReopenWithCF({"pikachu"}, *options);
+
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    // move this table to L1
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+
+    // reset block cache
+    table_options.block_cache = NewLRUCache(64 * 1024);
+    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+    TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+    // create new table at L0
+    ASSERT_OK(Put(1, "a2", "begin2"));
+    ASSERT_OK(Put(1, "z2", "end2"));
+    ASSERT_OK(Flush(1));
+
+    if (close_afterwards) {
+      Close();  // This ensures that there is no ref to block cache entries
+    }
+    table_options.block_cache->EraseUnRefEntries();
+  }
+
+  bool infinite_max_files_;
+  bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+  Options options = CurrentOptions();
+  if (infinite_max_files_) {
+    options.max_open_files = -1;
+  }
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // only index/filter were added
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+  std::string value;
+  // Miss and hit count should remain the same, they're all pinned.
+  ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // Miss and hit count should remain the same, they're all pinned.
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+  Options options = CurrentOptions();
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+  // get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  std::string value;
+  // this should be read from L0
+  // so cache values don't change
+  value = Get(1, "a2");
+  ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  // the file is opened, prefetching results in a cache filter miss
+  // the block is loaded and added to the cache,
+  // then the get results in a cache hit for L1
+  // When we have infinite max_files, there is still a cache miss because we
+  // have reset the block cache
+  value = Get(1, "a");
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+  Options options = CurrentOptions();
+  // This ensures that db does not ref anything in the block cache, so
+  // EraseUnRefEntries could clear them up.
+  bool close_afterwards = true;
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+  // Get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  if (disallow_preload_) {
+    // Now we have two files. We narrow the max open files to allow 3 entries
+    // so that preloading SST files won't happen.
+    options.max_open_files = 13;
+    // RocksDB sanitizes max open files to at least 20. Modify it back.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = static_cast<int*>(arg);
+          *max_open_files = 13;
+        });
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Reopen database. If max_open_files is set as -1, table readers will be
+  // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+  // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (!disallow_preload_) {
+    // After reopen, cache misses are increased by one because we read (and
+    // only read) filter and index on L0
+    ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // If max_open_files is not -1, we do not preload table readers, so there
+    // is no change.
+    ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+  std::string value;
+  // this should be read from L0
+  value = Get(1, "a2");
+  // If max_open_files is -1, we have pinned index and filter in Rep, so there
+  // will not be changes in index and filter misses or hits. If max_open_files
+  // is not -1, Get() will open a TableReader and prefetch index and filter.
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  value = Get(1, "a");
+  if (!disallow_preload_) {
+    // In the infinite max files case, there's a cache miss in executing Get()
+    // because index and filter are not prefetched before.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // In this case, the cache miss will be increased by one in
+    // BlockBasedTable::Open() because this is not in the DB::Open() code path,
+    // so we will prefetch L1's index and filter. The cache hit will also be
+    // increased by one because Get() will read the index and filter from the
+    // block cache prefetched in the previous Open() call.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+
+  // Force a full compaction to one single file. There will be a block
+  // cache read for both the index and the filter. If prefetch doesn't
+  // explicitly happen, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } + + // Bloom and index hit will happen when a Get() happens. + value = Get(1, "a"); + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } +} + +INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, + PinL0IndexAndFilterBlocksTest, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, false), + std::make_tuple(false, true))); + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, MaxCompactionBytesTest) { + Options options = CurrentOptions(); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 200 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 100 << 10; + // Infinite for full compaction. + options.max_compaction_bytes = options.target_file_size_base * 100; + + Reopen(options); + + Random rnd(301); + + for (int num = 0; num < 8; num++) { + GenerateNewRandomFile(&rnd); + } + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,8", FilesPerLevel(0)); + + // When compact from Ln -> Ln+1, cut a file if the file overlaps with + // more than three files in Ln+1. + options.max_compaction_bytes = options.target_file_size_base * 3; + Reopen(options); + + GenerateNewRandomFile(&rnd); + // Add three more small files that overlap with the previous file + for (int i = 0; i < 3; i++) { + ASSERT_OK(Put("a", "z")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Output files to L1 are cut to 4 pieces, according to + // options.max_compaction_bytes (300K) + // There are 8 files on L2 (grandparents level), each one is 100K. The first + // file overlaps with a, b which max_compaction_bytes is less than 300K, the + // second one overlaps with d, e, which is also less than 300K. Including any + // extra grandparent file will make the future compaction larger than 300K. 
+  // L1: [ 1 ] [ 2 ] [ 3 ] [ 4 ]
+  // L2: [a] [b] [c] [d] [e] [f] [g] [h]
+  ASSERT_EQ("0,4,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+  int* result = reinterpret_cast<int*>(arg);
+  if (*result == -1) {
+    *result = 0;
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+  explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+      : is_compressed_(is_compressed), max_size_(max_size) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+  }
+
+  ~MockPersistentCache() override {}
+
+  PersistentCache::StatsType Stats() override {
+    return PersistentCache::StatsType();
+  }
+
+  uint64_t NewId() override {
+    return last_id_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override {
+    MutexLock _(&lock_);
+
+    if (size_ > max_size_) {
+      size_ -= data_.begin()->second.size();
+      data_.erase(data_.begin());
+    }
+
+    data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+    size_ += size;
+    return Status::OK();
+  }
+
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override {
+    MutexLock _(&lock_);
+    auto it = data_.find(page_key.ToString());
+    if (it == data_.end()) {
+      return Status::NotFound();
+    }
+
+    assert(page_key.ToString() == it->first);
+    data->reset(new char[it->second.size()]);
+    memcpy(data->get(), it->second.c_str(), it->second.size());
+    *size = it->second.size();
+    return Status::OK();
+  }
+
+  bool IsCompressed() override { return is_compressed_; }
+
+  std::string GetPrintableOptions() const override {
+    return "MockPersistentCache";
+  }
+
+  port::Mutex lock_;
+  std::map<std::string, std::string> data_;
+  const bool is_compressed_ = true;
+  size_t size_ = 0;
+  const size_t max_size_ = 10 * 1024;  // 10KiB
+  std::atomic<uint64_t> last_id_{1};
+};
+
+#ifdef OS_LINUX
+// Make sure that in CPU time perf context counters, Env::NowCPUNanos()
+// is used, rather than Env::NowNanos();
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+  // force resizing table cache so table handle is not preloaded so that
+  // we can measure find_table_nanos during Get().
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // CPU timing is not enabled with kEnableTimeExceptForMutex
+  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+  ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
+
+  // Add time to NowNanos() reading.
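+  // Arithmetic for the bounds used below: kDummyAddonNanos =
+  // 10^9 ns/s * 10^6 s = 10^15 ns. Any timer that includes the mocked
+  // wall-clock sleep (e.g. find_table_nanos) must exceed this value, while a
+  // pure CPU-time counter such as get_cpu_nanos should stay far below it.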
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_GT(env_->now_cpu_count_.load(), 2); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); + + SetPerfLevel(PerfLevel::kDisable); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestPerfContextIterCpuTime) { + DestroyAndReopen(CurrentOptions()); + // force resizing table cache so table handle is not preloaded so that + // we can measure find_table_nanos during iteration + dbfull()->TEST_table_cache()->SetCapacity(0); + + const size_t kNumEntries = 10; + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i))); + } + ASSERT_OK(Flush()); + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i))); + } + std::string last_key = "k" + std::to_string(kNumEntries - 1); + std::string last_value = "v" + std::to_string(kNumEntries - 1); + env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + delete iter; + + constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; + constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; + + // Add time to NowNanos() reading. 
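+  // The ASSERT_GE(env_->now_cpu_count_.load(), 12) further below presumably
+  // reflects the six timed iterator operations (Seek, SeekForPrev,
+  // SeekToLast, SeekToFirst, Next, Prev), each sampling the CPU clock at
+  // least twice, once on entry and once on exit.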
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos); + ASSERT_GE(env_->now_cpu_count_.load(), 12); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); + + SetPerfLevel(PerfLevel::kDisable); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + delete iter; +} +#endif // OS_LINUX + +#if !defined OS_SOLARIS +TEST_F(DBTest2, PersistentCache) { + int num_iter = 80; + + Options options; + options.write_buffer_size = 64 * 1024; // small write buffer + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options = CurrentOptions(options); + + auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024}; + auto types = {/*compressed*/ 1, /*uncompressed*/ 0}; + for (auto bsize : bsizes) { + for (auto type : types) { + BlockBasedTableOptions table_options; + table_options.persistent_cache.reset( + new MockPersistentCache(type, 10 * 1024)); + table_options.no_block_cache = true; + table_options.block_cache = bsize ? 
NewLRUCache(bsize) : nullptr; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector({no_block_cache_opts, options})); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = rnd.RandomString(1000); + } + values.push_back(str); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + + auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT); + auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS); + + ASSERT_GT(hit, 0); + ASSERT_GT(miss, 0); + } + } +} +#endif // !defined OS_SOLARIS + +namespace { +void CountSyncPoint() { + TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */); +} +} // anonymous namespace + +TEST_F(DBTest2, SyncPointMarker) { + std::atomic sync_point_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBTest2::MarkedPoint", + [&](void* /*arg*/) { sync_point_called.fetch_add(1); }); + + // The first dependency enforces Marker can be loaded before MarkedPoint. + // The second checks that thread 1's MarkedPoint should be disabled here. 
+  // Execution order:
+  // |   Thread 1   |  Thread 2   |
+  // |              |   Marker    |
+  // | MarkedPoint  |             |
+  // | Thread1First |             |
+  // |              | MarkedPoint |
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+      {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+      {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::function<void()> func1 = [&]() {
+    CountSyncPoint();
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+  };
+
+  std::function<void()> func2 = [&]() {
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+    CountSyncPoint();
+  };
+
+  auto thread1 = port::Thread(func1);
+  auto thread2 = port::Thread(func2);
+  thread1.join();
+  thread2.join();
+
+  // Callback is only executed once
+  ASSERT_EQ(sync_point_called.load(), 1);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+  std::string buffer;
+
+  PutVarint32(&buffer, static_cast<uint32_t>(0));
+  PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+  PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+  return buffer.size() + key_size + value_size;
+}
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions bbto;
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    // Disable delta encoding to make it easier to calculate read amplification
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification
+    bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    DestroyAndReopen(options);
+
+    const size_t kNumEntries = 10000;
+
+    Random rnd(301);
+    for (size_t i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    // Read keys/values randomly and verify that reported read amp error
+    // is less than 2%
+    uint64_t total_useful_bytes = 0;
+    std::set<int> read_keys;
+    std::string value;
+    for (size_t i = 0; i < kNumEntries * 5; i++) {
+      int key_idx = rnd.Next() % kNumEntries;
+      std::string key = Key(key_idx);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(key_idx) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        total_useful_bytes +=
+            GetEncodedEntrySize(internal_key.size(), value.size());
+        read_keys.insert(key_idx);
+      }
+
+      double expected_read_amp =
+          static_cast<double>(total_useful_bytes) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double read_amp =
+          static_cast<double>(options.statistics->getTickerCount(
+              READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double error_pct = fabs(expected_read_amp - read_amp) * 100;
+      // Error between reported read amp and real read amp should be less than
+      // 2%
+      EXPECT_LE(error_pct, 2);
+    }
+
+    // Make sure we read everything in the DB (which is smaller than our cache)
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+    }
+    ASSERT_OK(iter->status());
+    delete iter;
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory
+    if (k == 0) {
+      ASSERT_EQ(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+    } else {
+      ASSERT_NEAR(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+              1.0f /
+              options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+          1, .01);
+    }
+  }
+}
+
+#ifndef OS_SOLARIS  // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+  {
+    const int kIdBufLen = 100;
+    char id_buf[kIdBufLen];
+    Status s = Status::NotSupported();
+#ifndef OS_WIN
+    // You can't open a directory on windows using random access file
+    std::unique_ptr<RandomAccessFile> file;
+    s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
+    if (s.ok()) {
+      if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+        // fs holding db directory doesn't support getting a unique file id,
+        // this means that running this test will fail because lru_cache will
+        // load the blocks again regardless of them being already in the cache
+        return;
+      }
+    }
+#endif
+    if (!s.ok()) {
+      std::unique_ptr<Directory> dir;
+      ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+      if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+        // fs holding db directory doesn't support getting a unique file id,
+        // this means that running this test will fail because lru_cache will
+        // load the blocks again regardless of them being already in the cache
+        return;
+      }
+    }
+  }
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+    std::shared_ptr<Statistics> stats =
+        ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions bbto;
+    // Disable delta encoding to make it easier to calculate read amplification
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification
+    bbto.block_cache = lru_cache;
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = stats;
+    DestroyAndReopen(options);
+
+    const int kNumEntries = 10000;
+
+    Random rnd(301);
+    for (int i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    std::set<int> read_keys;
+    std::string value;
+    // Iter1: Read half the DB, Read even keys
+    // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+    for (int i = 0; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    Close();
+    std::shared_ptr<Statistics> new_statistics =
+        ROCKSDB_NAMESPACE::CreateDBStatistics();
+    // Destroy old statistics obj that the blocks in lru_cache are pointing to
+    options.statistics.reset();
+    // Use the statistics object that we just created
+    options.statistics = new_statistics;
+    Reopen(options);
+
+    // Iter2: Read half the DB, Read odd keys
+    // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+    for (int i = 1; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory
+    if (k == 0) {
+      ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+                total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+    } else {
+      ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) *
+                      1.0f /
+                      (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+                  1, .01);
+    }
+  }
+}
+#endif  // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.IncreaseParallelism(20);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  auto get_stat = [](std::string level_str, LevelStatType type,
+                     std::map<std::string, std::string> props) {
+    auto prop_str =
+        "compaction." + level_str + "." +
+        InternalStats::compaction_level_stats.at(type).property_name.c_str();
+    auto prop_item = props.find(prop_str);
+    return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+  };
+
+  // Trivial move 2 files to L2
+  ASSERT_EQ("0,0,2", FilesPerLevel());
+  // Also test that the stats GetMapProperty API reports the same result
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+  }
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L2, these 2 files will be moved to L2 and overlap with
+  // the running compaction and break the LSM consistency.
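+  // (The two files are created from inside the CompactionJob::Run():Start
+  // callback below, i.e. while the manual compaction job is already running,
+  // which is what makes the trivial move overlap it.)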
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        ASSERT_OK(
+            dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+                                  {"max_bytes_for_level_base", "1"}}));
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L2
+  // into 1 file in L2
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test that the stats GetMapProperty API reports 1 file in L2
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+  }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.IncreaseParallelism(20);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Trivial move 2 files to L1
+  ASSERT_EQ("0,2", FilesPerLevel());
+
+  std::function<void()> bg_manual_compact = [&]() {
+    std::string k1 = Key(6);
+    std::string k2 = Key(9);
+    Slice k1s(k1);
+    Slice k2s(k2);
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+  };
+  ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L1, these 2 files will be moved to L1 and overlap with
+  // the running compaction and break the LSM consistency.
+  std::atomic<bool> flag(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        if (flag.exchange(true)) {
+          // We want to make sure to call this callback only once
+          return;
+        }
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+
+        // Start a non-exclusive manual compaction in a bg thread
+        bg_thread = port::Thread(bg_manual_compact);
+        // This manual compaction conflicts with the other manual compaction,
+        // so it should wait until the first compaction finishes
+        env_->SleepForMicroseconds(1000000);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L1
+  // into 1 file in L1
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  bg_thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Generate a file containing 10 keys.
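+  // (Two overlapping files with identical key ranges are generated so that
+  // the CompactRange below has real work to do when it is paused.)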
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate another file containing same keys
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  int manual_compactions_paused = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+        auto canceled = static_cast<std::atomic<bool>*>(arg);
+        // CompactRange triggers manual compaction and cancels the compaction
+        // by setting *canceled to true
+        if (canceled != nullptr) {
+          canceled->store(true, std::memory_order_release);
+        }
+        manual_compactions_paused += 1;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        // CompactFiles() relies on manual_compactions_paused to
+        // determine if this compaction should be paused or not
+        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        paused->fetch_add(1, std::memory_order_release);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> files_before_compact, files_after_compact;
+  // Remember file name before compaction is triggered
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_before_compact.push_back(file.name);
+  }
+
+  // OK, now trigger a manual compaction
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsManualCompactionPaused());
+
+  // Wait for compactions to get scheduled and stopped
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // Get file names after compaction is stopped
+  files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  // Like nothing happened
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  ASSERT_EQ(manual_compactions_paused, 1);
+
+  manual_compactions_paused = 0;
+  // Now make sure CompactFiles also does not run
+  ASSERT_TRUE(dbfull()
+                  ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                 files_before_compact, 0)
+                  .IsManualCompactionPaused());
+  // Wait for manual compaction to get scheduled and finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  files_meta.clear();
+  files_after_compact.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  // CompactFiles returns at entry point
+  ASSERT_EQ(manual_compactions_paused, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.disable_auto_compactions = false;
+
+  DestroyAndReopen(options);
+  dbfull()->DisableManualCompaction();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; i++) {
+    // Generate a file containing 100 keys.
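+    // (The two flushed files reach level0_file_num_compaction_trigger, so an
+    // automatic compaction should run even though manual compaction has been
+    // disabled above; the single live file asserted below proves it did.)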
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      [&](void* /*arg*/) { run_manual_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->DisableManualCompaction();
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  // As manual compaction is disabled, the sync point is not even reached
+  ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+  dbfull()->EnableManualCompaction();
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+        auto canceled = static_cast<std::atomic<bool>*>(arg);
+        // CompactRange triggers manual compaction and cancels the compaction
+        // by setting *canceled to true
+        if (canceled != nullptr) {
+          canceled->store(true, std::memory_order_release);
+        }
+        run_manual_compactions++;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        // CompactFiles() relies on manual_compactions_paused to
+        // determine if this compaction should be paused or not
+        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        paused->fetch_add(1, std::memory_order_release);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2");
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction1) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      [&](void* /*arg*/) { run_manual_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Setup a callback to disable compactions after a couple of levels are
+  // compacted
+  int compactions_run = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1",
+      [&](void* /*arg*/) { ++compactions_run; });
+
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // Since compactions are disabled (*canceled is still true here), we
+  // shouldn't start compacting at all, i.e. the compaction function should
+  // be called exactly zero times.
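+  // (compact_options.canceled behaves as a level-triggered flag: it is
+  // checked before each per-level manual compaction, so a pre-set true value
+  // stops the whole request up front, while setting it mid-run only cancels
+  // the remaining levels.)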
+  ASSERT_EQ(compactions_run, 0);
+  ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  compactions_run = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+        ++compactions_run;
+        // After 3 compactions, disable further compactions
+        if (compactions_run == 3) {
+          compact_options.canceled->store(true, std::memory_order_release);
+        }
+      });
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(compactions_run, 3);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+
+  // Compactions should work again if we re-enable them.
+  compact_options.canceled->store(false, std::memory_order_relaxed);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction2) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+  compact_options.max_subcompactions = 1;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  int compactions_run = 0;
+  std::atomic<int> kv_compactions{0};
+  int compactions_stopped_at = 0;
+  int kv_compactions_stopped_at = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1",
+      [&](void* /*arg*/) { ++compactions_run; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+        int kv_compactions_run =
+            kv_compactions.fetch_add(1, std::memory_order_release);
+        if (kv_compactions_run == 5) {
+          compact_options.canceled->store(true, std::memory_order_release);
+          kv_compactions_stopped_at = kv_compactions_run;
+          compactions_stopped_at = compactions_run;
+        }
+      });
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // NOTE: as we set compact_options.max_subcompactions = 1, and store true to
+  // the canceled variable from the single compacting thread (via callback),
+  // this value is deterministically
+  // kv_compactions_stopped_at + 1.
+  ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
+  ASSERT_EQ(compactions_run, compactions_stopped_at);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionIterator:ProcessKV");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+
+  // Compactions should work again if we re-enable them.
+  compact_options.canceled->store(false, std::memory_order_relaxed);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CancelCompactionListener : public EventListener {
+ public:
+  CancelCompactionListener()
+      : num_compaction_started_(0), num_compaction_ended_(0) {}
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    num_compaction_started_++;
+  }
+
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.status.code(), code_);
+    ASSERT_EQ(ci.status.subcode(), subcode_);
+    num_compaction_ended_++;
+  }
+
+  std::atomic<size_t> num_compaction_started_;
+  std::atomic<size_t> num_compaction_ended_;
+  Status::Code code_;
+  Status::SubCode subcode_;
+};
+
+TEST_F(DBTest2, CancelManualCompactionWithListener) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+  compact_options.max_subcompactions = 1;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  CancelCompactionListener* listener = new CancelCompactionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+        compact_options.canceled->store(true, std::memory_order_release);
+      });
+
+  int running_compaction = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::FinishCompactionOutputFile1",
+      [&](void* /*arg*/) { running_compaction++; });
+
+  // Case I: (1) notify begin compaction, (2) set *canceled to true in the
+  // callback function to disable manual compaction, (3) compaction does not
+  // run, (4) notify compaction end.
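+  // (A canceled manual compaction is expected to finish with
+  // Status::Incomplete() and SubCode::kManualCompactionPaused, which is what
+  // the listener checks in OnCompactionCompleted().)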
+  listener->code_ = Status::kIncomplete;
+  listener->subcode_ = Status::SubCode::kManualCompactionPaused;
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_,
+            listener->num_compaction_ended_);
+  ASSERT_EQ(running_compaction, 0);
+
+  listener->num_compaction_started_ = 0;
+  listener->num_compaction_ended_ = 0;
+
+  // Case II: (1) set *canceled to true in the callback function to disable
+  // manual compaction, (2) notify begin compaction (returns without
+  // notifying), (3) notify compaction end (returns without notifying).
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_,
+            listener->num_compaction_ended_);
+  ASSERT_EQ(running_compaction, 0);
+
+  // Case III: (1) notify begin compaction, (2) compaction runs in between,
+  // (3) set *canceled to true in the callback function to disable manual
+  // compaction, (4) notify compaction end.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionIterator:ProcessKV");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
+        compact_options.canceled->store(true, std::memory_order_release);
+      });
+
+  listener->code_ = Status::kOk;
+  listener->subcode_ = Status::SubCode::kNone;
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_,
+            listener->num_compaction_ended_);
+
+  // Compaction job will succeed.
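+  // (In Case III the cancellation lands at CompactionJob::Run:BeforeVerify,
+  // i.e. after the output file has already been finished, so
+  // FinishCompactionOutputFile1 fired at least once and the job still
+  // reports Status::OK().)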
+  ASSERT_GT(running_compaction, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
+  int num_levels = 3;
+  const int kNumFilesTrigger = 4;
+
+  Options options = CurrentOptions();
+  env_->SetBackgroundThreads(0, Env::Priority::HIGH);
+  env_->SetBackgroundThreads(0, Env::Priority::LOW);
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  options.env = env_;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+
+  CancelCompactionListener* listener = new CancelCompactionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  int num_bottom_thread_compaction_scheduled = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+      [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
+
+  int num_compaction_jobs = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* /*arg*/) { num_compaction_jobs++; });
+
+  listener->code_ = Status::kOk;
+  listener->subcode_ = Status::SubCode::kNone;
+
+  Random rnd(301);
+  for (int i = 0; i < 1; ++i) {
+    for (int num = 0; num < kNumFilesTrigger; num++) {
+      int key_idx = 0;
+      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+      // use no_wait above because that one waits for flush and compaction. We
+      // don't want to wait for compaction because the full compaction is
+      // intentionally blocked while more files are flushed.
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
+  ASSERT_EQ(num_compaction_jobs, 1);
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_,
+            listener->num_compaction_ended_);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForPointLookup(2);
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForSmallDb();
+
+  // Find the cache object
+  ASSERT_TRUE(options.table_factory->IsInstanceOf(
+      TableFactory::kBlockBasedTableName()));
+  auto table_options =
+      options.table_factory->GetOptions<BlockBasedTableOptions>();
+
+  ASSERT_TRUE(table_options != nullptr);
+  std::shared_ptr<Cache> cache = table_options->block_cache;
+
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  ASSERT_OK(Put("foo", "v1"));
+
+  // memtable size is charged to the block cache
+  ASSERT_NE(0, cache->GetUsage());
+
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(Flush());
+
+  size_t prev_size = cache->GetUsage();
+  // Remember block cache size, so that we can find that
+  // it is filled after Get().
+  // Use pinnable slice so that it can pin the block so that
+  // when we check the size it is not evicted.
+  PinnableSlice value;
+  ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo",
+                     &value));
+  ASSERT_GT(cache->GetUsage(), prev_size);
+  value.Reset();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, IterRaceFlush1) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
+       {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
+  });
+
+  // iterator is created after the first Put(), and its snapshot sequence is
+  // assigned after second Put(), so it must see v2.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v2", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRaceFlush2) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
+       {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
+  });
+
+  // iterator is created after the first Put(), and its snapshot sequence is
+  // assigned before second Put(), thus it must see v1.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v1", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRefreshRaceFlush) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
+       {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
+  });
+
+  // iterator is refreshed after the first Put(), and its sequence number is
+  // assigned after second Put(), thus it must see v2.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    ASSERT_OK(it->status());
+    ASSERT_OK(it->Refresh());
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v2", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush1) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+  ASSERT_NE("NOT_FOUND", Get("foo"));
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo")); + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, DirectIO) { + if (!IsDirectIOSupported()) { + return; + } + Options options = CurrentOptions(); + options.use_direct_reads = options.use_direct_io_for_flush_and_compaction = + true; + options.allow_mmap_reads = options.allow_mmap_writes = false; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "a")); + ASSERT_OK(Put(Key(5), "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(10), "a")); + ASSERT_OK(Put(Key(15), "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Reopen(options); +} + +TEST_F(DBTest2, MemtableOnlyIterator) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "first")); + ASSERT_OK(Put(1, "bar", "second")); + + ReadOptions ropt; + ropt.read_tier = kMemtableTier; + std::string value; + Iterator* it = nullptr; + + // Before flushing + // point lookups + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_EQ("first", value); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + ASSERT_EQ("second", value); + + // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet. + it = db_->NewIterator(ropt, handles_[1]); + int count = 0; + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_TRUE(it->Valid()); + count++; + } + ASSERT_TRUE(!it->Valid()); + ASSERT_EQ(2, count); + delete it; + + Flush(1); + + // After flushing + // point lookups + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_EQ("first", value); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + ASSERT_EQ("second", value); + // nothing should be returned using memtable-only iterator after flushing. 
+  it = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_OK(it->status());
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(0, count);
+  ASSERT_OK(it->status());
+  delete it;
+
+  // Add a key to memtable
+  ASSERT_OK(Put(1, "foobar", "third"));
+  it = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_OK(it->status());
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    ASSERT_EQ("foobar", it->key().ToString());
+    ASSERT_EQ("third", it->value().ToString());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(1, count);
+  ASSERT_OK(it->status());
+  delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+  Options options = CurrentOptions();
+  // Compaction pressure should trigger since we accumulate 6 L0 files below
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 12;
+  options.level0_stop_writes_trigger = 30;
+  options.delayed_write_rate = 8 * 1024 * 1024;
+  Reopen(options);
+
+  std::atomic<int> rate_limit_count(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:1", [&](void* arg) {
+        rate_limit_count.fetch_add(1);
+        int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+        ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+      });
+  // Block compaction
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  WriteOptions wo;
+  for (int i = 0; i < 6; i++) {
+    wo.low_pri = false;
+    ASSERT_OK(Put("", "", wo));
+    wo.low_pri = true;
+    ASSERT_OK(Put("", "", wo));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(0, rate_limit_count.load());
+  wo.low_pri = true;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+
+  TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  wo.low_pri = true;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+  // compaction input has 512KB data
+  const int kNumKeysPerFile = 128;
+  const int kBytesPerKey = 1024;
+  const int kNumL0Files = 4;
+
+  for (int compaction_readahead_size : {0, 32 << 10}) {
+    for (auto use_direct_io : {false, true}) {
+      if (use_direct_io && !IsDirectIOSupported()) {
+        continue;
+      }
+      Options options = CurrentOptions();
+      options.compaction_readahead_size = compaction_readahead_size;
+      options.compression = kNoCompression;
+      options.level0_file_num_compaction_trigger = kNumL0Files;
+      options.memtable_factory.reset(
+          test::NewSpecialSkipListFactory(kNumKeysPerFile));
+      // takes roughly one second, split into 100 x 10ms intervals. Each
+      // interval permits 5.12KB, which is smaller than the block size, so this
+      // test exercises the code for chunking reads.
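+      // (Worked out: 4 files * 128 keys * 1KB = 512KB of input; a rate of
+      // 512KB/s refilled every 10ms grants 512KB / 100 = 5.12KB per refill,
+      // which is below the 16KB block_size configured below.)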
+      options.rate_limiter.reset(NewGenericRateLimiter(
+          static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+                               kBytesPerKey) /* rate_bytes_per_sec */,
+          10 * 1000 /* refill_period_us */, 10 /* fairness */,
+          RateLimiter::Mode::kReadsOnly));
+      options.use_direct_reads =
+          options.use_direct_io_for_flush_and_compaction = use_direct_io;
+      BlockBasedTableOptions bbto;
+      bbto.block_size = 16384;
+      bbto.no_block_cache = true;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNumL0Files; ++i) {
+        for (int j = 0; j <= kNumKeysPerFile; ++j) {
+          ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+        if (i + 1 < kNumL0Files) {
+          ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+        }
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+      // should be slightly above 512KB due to non-data blocks read.
+      // Arbitrarily chose 1MB as the upper bound on the total bytes read.
+      size_t rate_limited_bytes = static_cast<size_t>(
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
+      // The charges can exist for `IO_LOW` and `IO_USER` priorities.
+      size_t rate_limited_bytes_by_pri =
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+      ASSERT_EQ(rate_limited_bytes,
+                static_cast<size_t>(rate_limited_bytes_by_pri));
+      // Include the explicit prefetch of the footer in direct I/O case.
+      size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+      ASSERT_GE(
+          rate_limited_bytes,
+          static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+      ASSERT_LT(
+          rate_limited_bytes,
+          static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey *
+                                  kNumL0Files +
+                              direct_io_extra));
+
+      Iterator* iter = db_->NewIterator(ReadOptions());
+      ASSERT_OK(iter->status());
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+      }
+      delete iter;
+      // bytes read for user iterator shouldn't count against the rate limit.
+      rate_limited_bytes_by_pri =
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+      ASSERT_EQ(rate_limited_bytes,
+                static_cast<size_t>(rate_limited_bytes_by_pri));
+    }
+  }
+}
+#endif  // ROCKSDB_LITE
+
+// Make sure DB can be reopened with a reduced number of levels, given no file
+// is on levels higher than the new num_levels.
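+// (CompactRange with change_level=true and target_level=1 first moves the
+// single file from L6 down to L1, so the subsequent reopen with num_levels=3
+// finds no file above the new highest level.)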
+TEST_F(DBTest2, ReduceLevel) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  Reopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 1;
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  options.num_levels = 3;
+  Reopen(options);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+}
+
+// Test that ReadCallback is actually used in both memtable and SST tables
+TEST_F(DBTest2, ReadCallbackTest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  options.env = env_;
+  Reopen(options);
+  std::vector<const Snapshot*> snapshots;
+  // Try to create a db with multiple layers and a memtable
+  const std::string key = "foo";
+  const std::string value = "bar";
+  // This test assumes that the seq starts with 1 and is increased by 1 after
+  // each write batch of size 1. If that behavior changes, the test needs to
+  // be updated as well.
+  // TODO(myabandeh): update this test to use the seq number that is returned
+  // by the DB instead of assuming what seq the DB used.
+  int i = 1;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    // Take a snapshot to avoid the value being removed during compaction
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+  for (; i < 20; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    // Take a snapshot to avoid the value being removed during compaction
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  for (; i < 30; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  // And also add some values to the memtable
+  for (; i < 40; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+
+  class TestReadCallback : public ReadCallback {
+   public:
+    explicit TestReadCallback(SequenceNumber snapshot)
+        : ReadCallback(snapshot), snapshot_(snapshot) {}
+    bool IsVisibleFullCheck(SequenceNumber seq) override {
+      return seq <= snapshot_;
+    }
+
+   private:
+    SequenceNumber snapshot_;
+  };
+
+  for (int seq = 1; seq < i; seq++) {
+    PinnableSlice pinnable_val;
+    ReadOptions roptions;
+    TestReadCallback callback(seq);
+    bool dont_care = true;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+    get_impl_options.value = &pinnable_val;
+    get_impl_options.value_found = &dont_care;
+    get_impl_options.callback = &callback;
+    Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+    ASSERT_TRUE(s.ok());
+    // Assuming that after each Put the DB increases seq by one, the value and
+    // seq number must be equal since we also inc value by 1 after each Put.
+    ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+  }
+
+  for (auto snapshot : snapshots) {
+    dbfull()->ReleaseSnapshot(snapshot);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+  // Regression test for race condition where an obsolete file is returned to
+  // user as a "live file" but then deleted, all while file deletions are
+  // disabled.
+  //
+  // It happened like this:
+  //
+  // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+  // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and
+  //    the latter returned "x.log"
+  // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+  // 4. [user thread] Reading "x.log" failed
+  //
+  // Unfortunately the only regression test I can come up with involves sleep.
+  // We cannot set SyncPoints to repro since, once the fix is applied, the
+  // SyncPoints would cause a deadlock as the repro's sequence of events is now
+  // prohibited.
+  //
+  // Instead, if we sleep for a second between Find and Purge, and ensure the
+  // read attempt happens after purge, then the sequence of events will almost
+  // certainly happen on the old code.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallFlush:FilesFound",
+       "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+      {"DBImpl::PurgeObsoleteFiles:End",
+       "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::PurgeObsoleteFiles:Begin",
+      [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key", "val"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+  ASSERT_OK(db_->DisableFileDeletions());
+  VectorLogPtr log_files;
+  ASSERT_OK(db_->GetSortedWalFiles(log_files));
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+  for (const auto& log_file : log_files) {
+    ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+  }
+
+  ASSERT_OK(db_->EnableFileDeletions());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+  Options options = CurrentOptions();
+  bool prefetch_supported =
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+  // disable block cache
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  env_->count_random_reads_ = true;
+  env_->random_file_open_counter_.store(0);
+  ASSERT_OK(Put("bar", "foo"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  if (prefetch_supported) {
+    // After flush, we'll open the file and read footer, meta block,
+    // property block and index block.
+    ASSERT_EQ(4, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch not supported, we will do a single read into a buffer
+    ASSERT_EQ(1, env_->random_read_counter_.Read());
+  }
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // All files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(Put("bar2", "foo2"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Flush());
+  if (prefetch_supported) {
+    // After flush, we'll open the file and read footer, meta block,
+    // property block and index block.
+    ASSERT_EQ(4, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch not supported, we will do a single read into a buffer
+    ASSERT_EQ(1, env_->random_read_counter_.Read());
+  }
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  if (prefetch_supported) {
+    // Compaction needs two input blocks, which requires 2 preads, and
+    // generates a new SST file which needs 4 preads (footer, meta block,
+    // property block and index block). In total 6.
+    ASSERT_EQ(6, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch off, compaction needs two input blocks,
+    // followed by a single buffered read. In total 3.
+    ASSERT_EQ(3, env_->random_read_counter_.Read());
+  }
+  // All compaction input files should have already been opened.
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("foo2", Get("bar2"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // SST files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
+class TraceExecutionResultHandler : public TraceRecordResult::Handler {
+ public:
+  TraceExecutionResultHandler() {}
+  ~TraceExecutionResultHandler() override {}
+
+  virtual Status Handle(
+      const StatusOnlyTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceWrite: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        writes_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(
+      const SingleValueTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceGet: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        gets_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(
+      const MultiValuesTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    for (const Status& s : result.GetMultiStatus()) {
+      s.PermitUncheckedError();
+    }
+    switch (result.GetTraceType()) {
+      case kTraceMultiGet: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        multigets_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(const IteratorTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceIteratorSeek:
+      case kTraceIteratorSeekForPrev: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        seeks_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  void Reset() {
+    total_latency_ = 0;
+    cnt_ = 0;
+    writes_ = 0;
+    gets_ = 0;
+    seeks_ = 0;
+    multigets_ = 0;
+  }
+
+  double GetAvgLatency() const {
+    return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
+  }
+
+  int GetNumWrites() const { return writes_; }
+
+  int GetNumGets() const { return gets_; }
+
+  int GetNumIterSeeks() const { return seeks_; }
+
+  int GetNumMultiGets() const { return multigets_; }
+
+ private:
+  std::atomic<uint64_t> total_latency_{0};
+  std::atomic<int> cnt_{0};
+  std::atomic<int> writes_{0};
+  std::atomic<int> gets_{0};
+  std::atomic<int> seeks_{0};
+  std::atomic<int> multigets_{0};
+};
+
+TEST_F(DBTest2, TraceAndReplay) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename,
+                               &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  // 5 Writes
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  // 6th Write
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  // 2 Seek(ForPrev)s
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");  // Seek 1
+  single_iter->SeekForPrev("g");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  // 2 Gets
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  // 7th and 8th Write, 3rd Get
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  // Total Write x 8, Get x 3, Seek x 2.
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+  ASSERT_OK(Put("hello", "world"));
+  ASSERT_OK(Merge("foo", "bar"));
+
+  // Open another db, replay, and verify the data
+  std::string value;
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+  TraceExecutionResultHandler res_handler;
+  std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)> res_cb =
+      [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
+        ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
+        if (res != nullptr) {
+          ASSERT_OK(res->Accept(&res_handler));
+          res.reset();
+        }
+      };
+
+  // Unprepared replay should fail with Status::Incomplete()
+  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+  ASSERT_OK(replayer->Prepare());
+  // Ok to repeatedly Prepare().
+  ASSERT_OK(replayer->Prepare());
+  // Replay using 1 thread, 1x speed.
+  ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ("1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+  ASSERT_EQ("12", value);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+  ASSERT_EQ("bar", value);
+  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+  ASSERT_EQ("rocks", value);
+
+  // Re-replay should fail with Status::Incomplete() if Prepare() was not
+  // called. Currently we don't distinguish between unprepared and trace end.
+  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+
+  // Re-replay using 2 threads, 2x speed.
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Re-replay using 2 threads, 1/2 speed.
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  replayer.reset();
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceAndManualReplay) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  // Write some sequenced keys for testing lower/upper bounds of iterator.
+  batch.Clear();
+  ASSERT_OK(batch.Put("iter-0", "iter-0"));
+  ASSERT_OK(batch.Put("iter-1", "iter-1"));
+  ASSERT_OK(batch.Put("iter-2", "iter-2"));
+  ASSERT_OK(batch.Put("iter-3", "iter-3"));
+  ASSERT_OK(batch.Put("iter-4", "iter-4"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  ReadOptions bounded_ro = ro;
+  Slice lower_bound("iter-1");
+  Slice upper_bound("iter-3");
+  bounded_ro.iterate_lower_bound = &lower_bound;
+  bounded_ro.iterate_upper_bound = &upper_bound;
+  single_iter = db_->NewIterator(bounded_ro);
+  single_iter->Seek("iter-0");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-1");
+  single_iter->Seek("iter-2");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  single_iter->Seek("iter-4");
+  ASSERT_FALSE(single_iter->Valid());
+  single_iter->SeekForPrev("iter-0");
+  ASSERT_FALSE(single_iter->Valid());
+  single_iter->SeekForPrev("iter-2");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  single_iter->SeekForPrev("iter-4");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
+  // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
+  // Seek(ForPrev)s.
+  // Total Write x 9, Get x 3, Seek x 8
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+ ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). + DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + + TraceExecutionResultHandler res_handler; + + // Manual replay for 2 times. The 2nd checks if the replay can restart. + std::unique_ptr record; + std::unique_ptr result; + for (int i = 0; i < 2; i++) { + // Next should fail if unprepared. + ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); + ASSERT_OK(replayer->Prepare()); + Status s = Status::OK(); + // Looping until trace end. + while (s.ok()) { + s = replayer->Next(&record); + // Skip unsupported operations. + if (s.IsNotSupported()) { + continue; + } + if (s.ok()) { + ASSERT_OK(replayer->Execute(record, &result)); + if (result != nullptr) { + ASSERT_OK(result->Accept(&res_handler)); + if (record->GetTraceType() == kTraceIteratorSeek || + record->GetTraceType() == kTraceIteratorSeekForPrev) { + IteratorSeekQueryTraceRecord* iter_rec = + dynamic_cast(record.get()); + IteratorTraceExecutionResult* iter_res = + dynamic_cast(result.get()); + // Check if lower/upper bounds are correctly saved and decoded. + std::string lower_str = iter_rec->GetLowerBound().ToString(); + std::string upper_str = iter_rec->GetUpperBound().ToString(); + std::string iter_key = iter_res->GetKey().ToString(); + std::string iter_value = iter_res->GetValue().ToString(); + if (!lower_str.empty() && !upper_str.empty()) { + ASSERT_EQ(lower_str, "iter-1"); + ASSERT_EQ(upper_str, "iter-3"); + if (iter_res->GetValid()) { + // If iterator is valid, then lower_bound <= key < upper_bound. + ASSERT_GE(iter_key, lower_str); + ASSERT_LT(iter_key, upper_str); + } else { + // If iterator is invalid, then + // key < lower_bound or key >= upper_bound. + ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str); + } + } + // If iterator is invalid, the key and value should be empty. + if (!iter_res->GetValid()) { + ASSERT_TRUE(iter_key.empty()); + ASSERT_TRUE(iter_value.empty()); + } + } + result.reset(); + } + } + } + // Status::Incomplete() will be returned when manually reading the trace + // end, or Prepare() was not called. 
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+    ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+    ASSERT_EQ(res_handler.GetNumWrites(), 9);
+    ASSERT_EQ(res_handler.GetNumGets(), 3);
+    ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
+    ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+    res_handler.Reset();
+  }
+
+  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ("1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+  ASSERT_EQ("12", value);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+  ASSERT_EQ("bar", value);
+  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+  ASSERT_EQ("rocks", value);
+
+  // Test execution of artificially created TraceRecords.
+  uint64_t fake_ts = 1U;
+  // Write
+  batch.Clear();
+  ASSERT_OK(batch.Put("trace-record-write1", "write1"));
+  ASSERT_OK(batch.Put("trace-record-write2", "write2"));
+  record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Write x 1
+  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
+  ASSERT_EQ("write1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
+  ASSERT_EQ("write2", value);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 1);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Get related
+  // Get an existing key.
+  record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
+                                       "trace-record-write1", fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Get x 1
+  // Get a non-existing key, should still return Status::OK().
+  record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
+                                       fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Get x 2
+  // Get from an invalid (non-existing) cf_id.
+  uint32_t invalid_cf_id = handles[1]->GetID() + 1;
+  record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+  ASSERT_TRUE(result == nullptr);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 2);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Iteration related
+  for (IteratorSeekQueryTraceRecord::SeekType seekType :
+       {IteratorSeekQueryTraceRecord::kSeek,
+        IteratorSeekQueryTraceRecord::kSeekForPrev}) {
+    // Seek to an existing key.
+    record.reset(new IteratorSeekQueryTraceRecord(
+        seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
+    ASSERT_OK(replayer->Execute(record, &result));
+    ASSERT_TRUE(result != nullptr);
+    ASSERT_OK(result->Accept(&res_handler));  // Seek x 1 in one iteration
+    // Seek to a non-existing key, should still return Status::OK().
+    record.reset(new IteratorSeekQueryTraceRecord(
+        seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
+    ASSERT_OK(replayer->Execute(record, &result));
+    ASSERT_TRUE(result != nullptr);
+    ASSERT_OK(result->Accept(&res_handler));  // Seek x 2 in one iteration
+    // Seek from an invalid cf_id.
+    record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
+                                                  "whatever", fake_ts++));
+    ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+    ASSERT_TRUE(result == nullptr);
+  }
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 4);  // Seek x 2 in two iterations
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // MultiGet related
+  // Get existing keys.
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a", "foo"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 1
+  // Get all non-existing keys, should still return Status::OK().
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"no1", "no2"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 2
+  // Get a mix of existing and non-existing keys, should still return
+  // Status::OK().
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a", "no2"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  MultiValuesTraceExecutionResult* mvr =
+      dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
+  ASSERT_TRUE(mvr != nullptr);
+  ASSERT_OK(mvr->GetMultiStatus()[0]);
+  ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
+  ASSERT_EQ(mvr->GetValues()[0], "1");
+  ASSERT_EQ(mvr->GetValues()[1], "");
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 3
+  // Get from an invalid (non-existing) cf_id.
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>(
+          {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
+      std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+  ASSERT_TRUE(result == nullptr);
+  // Empty MultiGet
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+  ASSERT_TRUE(result == nullptr);
+  // MultiGet size mismatch
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a"}), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+  ASSERT_TRUE(result == nullptr);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
+  res_handler.Reset();
+
+  replayer.reset();
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // test the max trace file size options
+  trace_opts.max_trace_file_size = 5;
+  std::string trace_filename = dbname_ + "/rocksdb.trace1";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "1"));
+  ASSERT_OK(Put(0, "c", "1"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+  replayer.reset();
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+  Options options = CurrentOptions();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // test the trace file sampling options
+  trace_opts.sampling_frequency = 2;
+  std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "2"));
+  ASSERT_OK(Put(0, "c", "3"));
+  ASSERT_OK(Put(0, "d", "4"));
+  ASSERT_OK(Put(0, "e", "5"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+  replayer.reset();
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  trace_opts.filter = TraceFilterType::kTraceFilterWrite;
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+ ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). + DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); + + // All the key-values should not present since we filter out the WRITE ops. + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound()); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Set up a new db. + std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); + ASSERT_OK(DestroyDB(dbname3, options)); + + DB* db3_init = nullptr; + options.create_if_missing = true; + ColumnFamilyHandle* cf3; + ASSERT_OK(DB::Open(options, dbname3, &db3_init)); + ASSERT_OK( + db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3)); + delete cf3; + delete db3_init; + + column_families.clear(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + handles.clear(); + + DB* db3 = nullptr; + ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound()); + + // The tracer will not record the READ ops. 
+  trace_opts.filter = TraceFilterType::kTraceFilterGet;
+  std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+  std::unique_ptr<TraceWriter> trace_writer3;
+  ASSERT_OK(
+      NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+  ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+  ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+  ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+  ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+  ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+  ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ(value, "1");
+  ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  ASSERT_OK(db3->EndTrace());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db3;
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  std::unique_ptr<TraceReader> trace_reader3;
+  ASSERT_OK(
+      NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file:
+  int count = 0;
+  std::string data;
+  Status s;
+  while (true) {
+    s = trace_reader3->Read(&data);
+    if (!s.ok()) {
+      break;
+    }
+    count += 1;
+  }
+  // We also need to count the header and footer.
+  // 4 WRITE + HEADER + FOOTER = 6
+  ASSERT_EQ(count, 6);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  if (!IsMemoryMappedAccessSupported()) {
+    ROCKSDB_GTEST_SKIP("Test requires default environment");
+    return;
+  }
+  options.allow_mmap_reads = true;
+  options.max_open_files = 100;
+  options.compression = kNoCompression;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  PinnableSlice pinned_value;
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  // It is not safe to pin mmap files as they might disappear due to compaction
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+
+  // Ensure pinned_value doesn't rely on memory munmap'd by the above
+  // compaction. It crashes if it does.
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+  pinned_value.Reset();
+  // Unsafe to pin mmap files when they could be kicked out of table cache
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  pinned_value.Reset();
+  // In read-only mode with infinite capacity on table cache it should pin the
+  // value and avoid the memcpy
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_TRUE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = false;
+  bbto.cache_index_and_filter_blocks = false;
+  bbto.block_cache = NewLRUCache(100000);
+  bbto.block_size = 400;  // small block size
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string v = rnd.RandomString(400);
+
+  // Since v is the size of a block, each key should take a block
+  // of 400+ bytes.
+ ASSERT_OK(Put("1", v)); + ASSERT_OK(Put("3", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("7", v)); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Verify that iterators don't pin more than one data block in block cache + // at each time. + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + + for (int i = 0; i < 4; i++) { + ASSERT_TRUE(iter->Valid()); + // Block cache should contain exactly one block. + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("4"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_OK(iter->status()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + } + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Test compaction case + ASSERT_OK(Put("2", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("6", v)); + ASSERT_OK(Put("8", v)); + ASSERT_OK(Flush()); + + // Clear existing data in block cache + bbto.block_cache->SetCapacity(0); + bbto.block_cache->SetCapacity(100000); + + // Verify compaction input iterators don't hold more than one data blocks at + // one time. + std::atomic finished(false); + std::atomic block_newed(0); + std::atomic block_destroyed(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Block::Block:0", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load()); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); + block_newed.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Block::~Block", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); + block_destroyed.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", + [&](void* /*arg*/) { finished = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Two input files. Each of them has 4 data blocks. 
+  ASSERT_EQ(8, block_newed.load());
+  ASSERT_EQ(8, block_destroyed.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+  std::atomic<bool> called(false);
+  size_t expected_lower_bound = 512 * 1024;
+  size_t expected_higher_bound = 512 * 1024;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+        size_t* prefetch_size = static_cast<size_t*>(arg);
+        EXPECT_LE(expected_lower_bound, *prefetch_size);
+        EXPECT_GE(expected_higher_bound, *prefetch_size);
+        called = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  expected_lower_bound = 0;
+  expected_higher_bound = 8 * 1024;
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  // Full compaction to make sure there is no L0 file after the open.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::atomic<bool> first_call(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+        size_t* prefetch_size = static_cast<size_t*>(arg);
+        if (first_call) {
+          EXPECT_EQ(4 * 1024, *prefetch_size);
+          first_call = false;
+        } else {
+          EXPECT_GE(4 * 1024, *prefetch_size);
+        }
+        called = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.max_file_opening_threads = 1;  // one thread
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.max_open_files = -1;
+  Reopen(options);
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  // Parallel loading SST files
+  options.max_file_opening_threads = 16;
+  Reopen(options);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+       "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+       "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1", "test2"}, Options());
+  ASSERT_EQ(handles_.size(), 2);
+
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  port::Thread user_thread1([&]() {
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+  });
+
+  port::Thread user_thread2([&]() {
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestCompactFiles::IngestExternalFile1",
+       "TestCompactFiles::IngestExternalFile2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.env = env_;
+  options.num_levels = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  auto* handle = db_->DefaultColumnFamily();
+  ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+      ROCKSDB_NAMESPACE::EnvOptions(), options};
+  std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+  std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+  std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+  ASSERT_OK(sst_file_writer.Open(external_file1));
+  ASSERT_OK(sst_file_writer.Put("1", "1"));
+  ASSERT_OK(sst_file_writer.Put("2", "2"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file2));
+  ASSERT_OK(sst_file_writer.Put("3", "3"));
+  ASSERT_OK(sst_file_writer.Put("4", "4"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file3));
+  ASSERT_OK(sst_file_writer.Put("5", "5"));
+  ASSERT_OK(sst_file_writer.Put("6", "6"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+                                    IngestExternalFileOptions()));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+  std::vector<std::string> files;
+  GetSstFiles(env_, dbname_, &files);
+  ASSERT_EQ(files.size(), 2);
+
+  Status user_thread1_status;
+  port::Thread user_thread1([&]() {
+    user_thread1_status =
+        db_->CompactFiles(CompactionOptions(), handle, files, 1);
+  });
+
+  Status user_thread2_status;
+  port::Thread user_thread2([&]() {
+    user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
+                                                  IngestExternalFileOptions());
+    TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ASSERT_OK(user_thread1_status);
+  ASSERT_OK(user_thread2_status);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+  const int kNumDbs = 2;
+  Options options = CurrentOptions();
+  std::vector<std::string> dbnames;
+  for (int i = 0; i < kNumDbs; ++i) {
+    dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i)));
+    ASSERT_OK(DestroyDB(dbnames.back(), options));
+  }
+
+  // Verify empty DBs can be created in parallel
+  std::vector<port::Thread> open_threads;
+  std::vector<DB*> dbs{static_cast<size_t>(kNumDbs), nullptr};
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Now add some data and close, so next we can verify non-empty DBs can be
+  // recovered in parallel
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+    delete dbs[i];
+  }
+
+  // Verify non-empty DBs can be recovered in parallel
+  open_threads.clear();
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Wait and cleanup
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    delete dbs[i];
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+  }
+}
+
+namespace {
+class DummyOldStats : public Statistics {
+ public:
+  const char* Name() const override { return "DummyOldStats"; }
+  uint64_t getTickerCount(uint32_t /*ticker_type*/) const override {
+    return 0;
+  }
+  void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+    num_rt++;
+  }
+  void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+  uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+    return 0;
+  }
+  void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+    num_mt++;
+  }
+  void histogramData(
+      uint32_t /*histogram_type*/,
+      ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+  std::string getHistogramString(uint32_t /*type*/) const override {
+    return "";
+  }
+  bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+  std::string ToString() const override { return ""; }
+  std::atomic<int> num_rt{0};
+  std::atomic<int> num_mt{0};
+};
+}  // anonymous namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+  DummyOldStats* dos = new DummyOldStats();
+  std::shared_ptr<Statistics> stats(dos);
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = stats;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_GT(dos->num_rt, 0);
+  ASSERT_GT(dos->num_mt, 0);
+}
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1]  f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Put("bbb1", ""));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+
+  // Seeking into f1, the iterator will check the bloom filter, which reports
+  // the file iterator to be invalid, so the cursor is placed at f2, with
+  // the next key being "ddd0".
+  iter->Seek("bbb1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
+  iter->Seek("ccc1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("ccc1", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1]  f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+
+  // The seek is filtered out by f1's bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 would also be valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8 * 8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
+
+// When the DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and it is written again
+// after each flush. If the DB crashes between the flushes, the CF that was
+// already flushed will have advanced past the latest log file, which we then
+// require not to be corrupted, triggering a corruption report.
+// We need to fix the bug and enable the test.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+  const std::vector<std::string> sync_points = {
+      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+  for (const auto& test_sync_point : sync_points) {
+    Options options = CurrentOptions();
+    // First destroy original db to ensure a clean start.
+    DestroyAndReopen(options);
+    options.create_if_missing = true;
+    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(1, "foo", "bar"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be divided into two blocks.
+    std::string large_value(400, ' ');
+    ASSERT_OK(Put("foo1", large_value));
+    ASSERT_OK(Put("foo2", large_value));
+    Close();
+
+    // Corrupt the log file in the middle, so that it is not corrupted
+    // in the tail.
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
+        std::string fname = dbname_ + "/" + f;
+        std::string file_content;
+        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+        file_content[400] = 'h';
+        file_content[401] = 'a';
+        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+        break;
+      }
+    }
+
+    // Reopen and freeze the file system after the first manifest write.
+    FaultInjectionTestEnv fit_env(options.env);
+    options.env = &fit_env;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        test_sync_point,
+        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NOK(TryReopenWithColumnFamilies(
+        {kDefaultColumnFamilyName, "pikachu"}, options));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    fit_env.SetFilesystemActive(true);
+    // If we continue using the failure-injection Env, it will complain about
+    // something when renaming the current file, which is not expected. Need
+    // to investigate why.
+    options.env = env_;
+    ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+                                          options));
+  }
+}
+
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "a"));
+  const Snapshot* s1 = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+  ASSERT_OK(Put("b", "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("x", "a"));
+  ASSERT_OK(Put("z", "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  {
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+    ASSERT_OK(iter->status());
+    iter->Seek("e");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("x", iter->key().ToString());
+  }
+  db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(DBTest2, BackgroundPurgeTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_manager =
+      std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+  options.avoid_unnecessary_blocking_io = true;
+  DestroyAndReopen(options);
+  size_t base_value = options.write_buffer_manager->memory_usage();
+
+  ASSERT_OK(Put("a", "a"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  ASSERT_OK(Flush());
+  size_t value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+  test::SleepingBackgroundTask sleeping_task_after;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after, Env::Priority::HIGH);
+  delete iter;
+
+  Env::Default()->SleepForMicroseconds(100000);
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+
+  test::SleepingBackgroundTask sleeping_task_after2;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after2, Env::Priority::HIGH);
+  sleeping_task_after2.WakeUp();
+  sleeping_task_after2.WaitUntilDone();
+
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_EQ(base_value, value);
+}
+
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  options.max_manifest_file_size = 10;
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+
+  ASSERT_OK(Put("foo", "value"));
+  const int kL0Files = options.level0_file_num_compaction_trigger;
+  for (int i = 0; i < kL0Files; ++i) {
+    ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+    ASSERT_OK(Flush(/*cf=*/1));
+  }
+
+  port::Thread thread([&]() { ASSERT_OK(Flush()); });
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+  // This test validates fractional cascading logic when several files at
+  // one level contain only the same user key.
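+  // (Fractional cascading here refers to reusing the position found while
+  // searching one level as a hint to narrow the search in the next level;
+  // files whose smallest keys are all the same user key stress that hint
+  // logic.)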
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("key", "1"));
+  ASSERT_OK(Put("key", "2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+                                   nullptr));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif  // ROCKSDB_LITE
+
+  ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, FileConsistencyCheckInOpen) {
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        Status* ret_s = static_cast<Status*>(arg);
+        *ret_s = Status::Corruption("fcc");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  ASSERT_NOK(TryReopen(options));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.block_size = 300;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  Reopen(options);
+
+  Random rnd(301);
+  std::string large_value = rnd.RandomString(500);
+
+  ASSERT_OK(Put("a1", large_value));
+  ASSERT_OK(Put("x1", large_value));
+  ASSERT_OK(Put("y1", large_value));
+  ASSERT_OK(Flush());
+
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_OK(iterator->status());
+    iterator->SeekForPrev("x3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+
+    iterator->SeekForPrev("a3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+
+    // Query more than one non-existing prefix to cover both the case of an
+    // empty hash bucket and of a hash bucket conflict.
+    iterator->SeekForPrev("b1");
+    // The result should be either invalid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("c1");
+    // The result should be either invalid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("d1");
+    // The result should be either invalid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+  }
+}
+
+TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
+  Options options = last_options_;
+  options.env = env_;
+  options.max_open_files = 20;
+  BlockBasedTableOptions bbto;
+  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  bbto.metadata_block_size = 128;
+  bbto.block_size = 128;
+  bbto.block_cache = NewLRUCache(16777216);
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Force no table cache so every read will preload the SST file.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  bbto.block_cache->SetCapacity(0);
+
+  Random rnd(301);
+  for (int i = 0; i < 4096; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
+  }
+  ASSERT_OK(Flush());
+
+  // Try different random failures in table open 300 times.
+  for (int i = 0; i < 300; i++) {
+    env_->num_reads_fails_ = 0;
+    env_->rand_reads_fail_odd_ = 8;
+
+    std::string value;
+    Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
+    if (env_->num_reads_fails_ > 0) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+
+  env_->rand_reads_fail_odd_ = 0;
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+  for (bool use_partitioned_filter : {true, false}) {
+    // create a DB with block prefix index
+    BlockBasedTableOptions table_options;
+    Options options = CurrentOptions();
+
+    // Sometimes filter is checked based on upper bound. Assert counters
+    // for that case. Otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+    bool expect_filter_check = !use_partitioned_filter;
+#else
+    bool expect_filter_check = false;
+#endif
+    table_options.partition_filters = use_partitioned_filter;
+    if (use_partitioned_filter) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.statistics = CreateDBStatistics();
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+
+    ASSERT_OK(Put("aa", ""));
+    ASSERT_OK(Put("xb", ""));
+    ASSERT_OK(Put("xx1", ""));
+    ASSERT_OK(Put("xz1", ""));
+    ASSERT_OK(Put("zz", ""));
+    ASSERT_OK(Flush());
+
+    // After reopening the DB with prefix size 2 => 1, the prefix extractor
+    // won't take effect unless it doesn't change results based
+    // on the upper bound and seek key.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    Reopen(options);
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+      ASSERT_OK(iterator->status());
+      iterator->Seek("xa");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
+      // correct in this case. So don't check counters in this case.
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xz");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xz1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    std::string ub_str = "xg9";
+    Slice ub(ub_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      ASSERT_OK(iterator->status());
+
+      // SeekForPrev() never uses prefix bloom if it is changed.
+      iterator->SeekForPrev("xg0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    ub_str = "xx9";
+    ub = Slice(ub_str);
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      ASSERT_OK(iterator->status());
+
+      iterator->Seek("x");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    CompactRangeOptions compact_range_opts;
+    compact_range_opts.bottommost_level_compaction =
+        BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+    // Re-execute similar queries after a full compaction
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+      iterator->Seek("x");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xg");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xz");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xz1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      ASSERT_OK(iterator->status());
+    }
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      iterator->SeekForPrev("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      ASSERT_OK(iterator->status());
+    }
+
+    ub_str = "xg9";
+    ub = Slice(ub_str);
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->SeekForPrev("xg0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+      ASSERT_OK(iterator->status());
+    }
+  }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.block_size = 300;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.level0_file_num_compaction_trigger = 8;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("b1", "ok"));
+  ASSERT_OK(Flush());
+
+  // Flush several files so that the chance that the hash bucket
+  // is empty for "b" in at least one of the files is high.
+  ASSERT_OK(Put("a1", ""));
+  ASSERT_OK(Put("c1", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a2", ""));
+  ASSERT_OK(Put("c2", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a3", ""));
+  ASSERT_OK(Put("c3", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a4", ""));
+  ASSERT_OK(Put("c4", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a5", ""));
+  ASSERT_OK(Put("c5", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+  do {
+    // create a DB with block prefix index
+    Options options = CurrentOptions();
+    BlockBasedTableOptions table_options =
+        *options.table_factory->GetOptions<BlockBasedTableOptions>();
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    options.statistics = CreateDBStatistics();
+
+    Reopen(options);
+
+    Random rnd(301);
+    std::string large_value = rnd.RandomString(500);
+
+    ASSERT_OK(Put("a1", large_value));
+    ASSERT_OK(Put("x1", large_value));
+    ASSERT_OK(Put("y1", large_value));
+    ASSERT_OK(Flush());
+
+    ReadOptions ro;
+    ro.total_order_seek = false;
+    ro.auto_prefix_mode = true;
+
+    const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    Slice ub;
+    ro.iterate_upper_bound = &ub;
+
+    ub = "b9";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "z";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "c";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "c1";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    // The same queries without recreating iterator
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b9";
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      ub = "z";
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "c";
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b9";
+      iterator->SeekForPrev("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "zz";
+      iterator->SeekToLast();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+
+      iterator->SeekToFirst();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    // Similar, now with reverse comparator
+    // Technically, we are violating axiom 2 of prefix_extractors, but
+    // it should be revised because of major use-cases using
+    // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
+    options.comparator = ReverseBytewiseComparator();
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("a1", large_value));
+    ASSERT_OK(Put("x1", large_value));
+    ASSERT_OK(Put("y1", large_value));
+    ASSERT_OK(Flush());
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b1";
+      iterator->Seek("b9");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      ub = "b1";
+      iterator->Seek("z");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->Seek("c");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("c9");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->Seek("b9");
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("a");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper
+      // comparing before seek key prevents a real bug from surfacing.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->SeekForPrev("b9");
+      ASSERT_TRUE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->SeekToLast();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+
+      iterator->SeekToFirst();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+    }
+
+    // Now something a bit different, related to "short" keys that
+    // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
+    options.comparator = BytewiseComparator();
+    for (const auto config : {"fixed:2", "capped:2"}) {
+      ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
+                                                 &options.prefix_extractor));
+
+      // FIXME: kHashSearch, etc. requires all keys be InDomain
+      if (StartsWith(config, "fixed") &&
+          (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
+           StartsWith(options.memtable_factory->Name(), "Hash"))) {
+        continue;
+      }
+      DestroyAndReopen(options);
+
+      const char* a_end_stuff = "a\xffXYZ";
+      const char* b_begin_stuff = "b\x00XYZ";
+      ASSERT_OK(Put("a", large_value));
+      ASSERT_OK(Put("b", large_value));
+      ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
+      ASSERT_OK(Put("c", large_value));
+      ASSERT_OK(Flush());
+
+      // control showing valid optimization with auto_prefix mode
+      ub = Slice(a_end_stuff, 4);
+      ro.iterate_upper_bound = &ub;
+
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // test, cannot be validly optimized with auto_prefix_mode
+      ub = Slice(b_begin_stuff, 2);
+      ro.iterate_upper_bound = &ub;
+
+      iterator->Seek(Slice(a_end_stuff, 2));
+      // !!! BUG !!! See "BUG" section of auto_prefix_mode.
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // To prove that is the wrong result, now use total order seek
+      ReadOptions tos_ro = ro;
+      tos_ro.total_order_seek = true;
+      tos_ro.auto_prefix_mode = false;
+      iterator.reset(db_->NewIterator(tos_ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("b", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+  } while (ChangeOptions(kSkipPlainTable));
+}
+
+class RenameCurrentTest : public DBTestBase,
+                          public testing::WithParamInterface<std::string> {
+ public:
+  RenameCurrentTest()
+      : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
+        sync_point_(GetParam()) {}
+
+  ~RenameCurrentTest() override {}
+
+  void SetUp() override {
+    env_->no_file_overwrite_.store(true, std::memory_order_release);
+  }
+
+  void TearDown() override {
+    env_->no_file_overwrite_.store(false, std::memory_order_release);
+  }
+
+  void SetupSyncPoints() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
+      Status* s = reinterpret_cast<Status*>(arg);
+      assert(s);
+      *s = Status::IOError("Injected IO error.");
+    });
+  }
+
+  const std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
+                        ::testing::Values("SetCurrentFile:BeforeRename",
+                                          "SetCurrentFile:AfterRename"));
+
+TEST_P(RenameCurrentTest, Open) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = TryReopen(options);
+  ASSERT_NOK(s);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+}
+
+TEST_P(RenameCurrentTest, Flush) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.max_manifest_file_size = 1;
+  options.create_if_missing = true;
+  Reopen(options);
+  ASSERT_OK(Put("key", "value"));
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_NOK(Flush());
+
+  ASSERT_NOK(Put("foo", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+  ASSERT_EQ("value", Get("key"));
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_P(RenameCurrentTest, Compaction) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.max_manifest_file_size = 1;
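+  // With max_manifest_file_size = 1, every version edit rolls over to a new
+  // MANIFEST, so SetCurrentFile() (where the sync point injects the rename
+  // failure) is exercised by the compaction below, not only at DB open.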
+  options.create_if_missing = true;
+  Reopen(options);
+  ASSERT_OK(Put("a", "a_value"));
+  ASSERT_OK(Put("c", "c_value"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b", "b_value"));
+  ASSERT_OK(Put("d", "d_value"));
+  ASSERT_OK(Flush());
+
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                               /*end=*/nullptr));
+
+  ASSERT_NOK(Put("foo", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ("d_value", Get("d"));
+}
+
+TEST_F(DBTest2, LastLevelTemperature) {
+  class TestListener : public EventListener {
+   public:
+    void OnFileReadFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileWriteFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileFlushFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileSyncFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileCloseFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+    std::unordered_map<uint64_t, Temperature> file_temperatures;
+
+   private:
+    void UpdateFileTemperature(const FileOperationInfo& info) {
+      auto filename = GetFileName(info.path);
+      uint64_t number;
+      FileType type;
+      ASSERT_TRUE(ParseFileName(filename, &number, &type));
+      if (type == kTableFile) {
+        MutexLock l(&mutex_);
+        auto ret = file_temperatures.insert({number, info.temperature});
+        if (!ret.second) {
+          // the temperature of a given file should be the same for all events
+          ASSERT_TRUE(ret.first->second == info.temperature);
+        }
+      }
+    }
+
+    std::string GetFileName(const std::string& fname) {
+      auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+      // Workaround for Windows only: the file path could contain both the
+      // Windows FilePathSeparator and '/'
+      filename = filename.substr(filename.find_last_of('/') + 1);
+      return filename;
+    }
+
+    port::Mutex mutex_;
+  };
+
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto* listener = new TestListener();
+
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.listeners.emplace_back(listener);
+  Reopen(options);
+
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  uint64_t number;
+  FileType type;
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  // non-bottommost file still has unknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // reopen and check the information is persisted
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // check other non-existent temperatures
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+      &prop));
+  ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+}
+
+TEST_F(DBTest2, LastLevelTemperatureUniversal) {
+  const int kTriggerNum = 3;
+  const int kNumLevels = 5;
+  const int kBottommostLevel = kNumLevels - 1;
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kTriggerNum;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  for (int i = 0; i < kTriggerNum; i++) {
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put("bar", "bar"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+
+  // Update bottommost temperature
+  options.bottommost_temperature = Temperature::kWarm;
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  // Should not impact existing ones
+  ASSERT_EQ(Temperature::kUnknown,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+
+  // newly generated file should have the new setting
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kWarm,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  // non-bottommost file still has unknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // check other non-existent temperatures
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+      &prop));
+  ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+  // Update bottommost temperature dynamically with SetOptions
+  auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
+  ASSERT_OK(s);
+  ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
+  db_->GetColumnFamilyMetaData(&metadata);
+  // Should not impact the existing files
+  ASSERT_EQ(Temperature::kWarm,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+
+  // newly generated files should have the new setting
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kCold,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(size, 0);
+
+  // kLastTemperature is an invalid temperature
+  options.bottommost_temperature = Temperature::kLastTemperature;
+  s = TryReopen(options);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, LastLevelStatistics) {
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.statistics = CreateDBStatistics();
+  Reopen(options);
+
+  // generate 1 sst on level 0
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0);
+
+  // 2nd flush to trigger compaction
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+
+  auto pre_bytes =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES);
+  auto pre_count =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+  // 3rd flush to generate 1 sst on level 0
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES),
+            pre_bytes);
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT),
+            pre_count);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+}
+
+TEST_F(DBTest2, CheckpointFileTemperature) {
+  class NoLinkTestFS : public FileTemperatureTestFS {
+    using FileTemperatureTestFS::FileTemperatureTestFS;
+
+    IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+                      IODebugContext*) override {
+      // Return NotSupported() to force the checkpoint to copy the file
+      // instead of just linking it.
+      return IOStatus::NotSupported();
+    }
+  };
+  auto test_fs = std::make_shared<NoLinkTestFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  // Set dynamic_level to true so that compaction compacts the data to the
+  // last level directly, which will have the last_level_temperature.
+  options.level_compaction_dynamic_level_bytes = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.env = env.get();
+  Reopen(options);
+
+  // generate a bottommost file and a non-bottommost file
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  auto size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  std::map<uint64_t, Temperature> temperatures;
+  std::vector<LiveFileStorageInfo> infos;
+  ASSERT_OK(
+      dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+  for (auto info : infos) {
+    temperatures.emplace(info.file_number, info.temperature);
+  }
+
+  test_fs->PopRequestedSstFileTemperatures();
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(
+      checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
+
+  // Check the src_temperature hints passed for the source files: there are
+  // 2 SST files, one kWarm and the other kUnknown.
+  std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+  test_fs->PopRequestedSstFileTemperatures(&requested_temps);
+  // Two requests
+  ASSERT_EQ(requested_temps.size(), 2);
+  std::set<uint64_t> distinct_requests;
+  for (const auto& requested_temp : requested_temps) {
+    // Matching manifest temperatures
+    ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
+    distinct_requests.insert(requested_temp.first);
+  }
+  // Each request is for a distinct file
+  ASSERT_EQ(distinct_requests.size(), requested_temps.size());
+
+  delete checkpoint;
+  Close();
+}
+
+TEST_F(DBTest2, FileTemperatureManifestFixup) {
+  auto test_fs =
+      std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  // Set dynamic_level to true so that compaction compacts the data to the
+  // last level directly, which will have the last_level_temperature.
+  options.level_compaction_dynamic_level_bytes = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.env = env.get();
+  std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
+  CreateAndReopenWithCF(cfs, options);
+  // Needed for later re-opens (weird)
+  cfs.insert(cfs.begin(), kDefaultColumnFamilyName);
+
+  // Generate a bottommost file in all CFs
+  for (int cf = 0; cf < 3; ++cf) {
+    ASSERT_OK(Put(cf, "a", "val"));
+    ASSERT_OK(Put(cf, "c", "val"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(Put(cf, "b", "val"));
+    ASSERT_OK(Put(cf, "d", "val"));
+    ASSERT_OK(Flush(cf));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify
+  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Generate a non-bottommost file in all CFs
+  for (int cf = 0; cf < 3; ++cf) {
+    ASSERT_OK(Put(cf, "e", "val"));
+    ASSERT_OK(Flush(cf));
+  }
+
+  // re-verify
+  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Now change FS temperature on bottommost file(s) to kCold
+  std::map<uint64_t, Temperature> current_temps;
+  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+  for (auto e : current_temps) {
+    if (e.second == Temperature::kWarm) {
+      test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
+    }
+  }
+  // Metadata not yet updated
+  ASSERT_EQ(Get("a"), "val");
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Update with Close and UpdateManifestForFilesState, but first save cf
+  // descriptors
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    ColumnFamilyDescriptor cfdescriptor;
+    // GetDescriptor is not implemented for ROCKSDB_LITE
+    handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+    column_families.push_back(cfdescriptor);
+  }
+  Close();
+  experimental::UpdateManifestForFilesStateOptions update_opts;
+  update_opts.update_temperatures = true;
+
+  ASSERT_OK(experimental::UpdateManifestForFilesState(
+      options, dbname_, column_families, update_opts));
+
+  // Re-open and re-verify after update
+  ReopenWithColumnFamilies(cfs, options);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Change kUnknown to kHot
+  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+  for (auto e : current_temps) {
+    if (e.second == Temperature::kUnknown) {
+      test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
+    }
+  }
+
+  // Update with Close and UpdateManifestForFilesState
+  Close();
+  ASSERT_OK(experimental::UpdateManifestForFilesState(
+      options, dbname_, column_families, update_opts));
+
+  // Re-open and re-verify after update
+  ReopenWithColumnFamilies(cfs, options);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
+
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
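+// In this mode a truncated or corrupted WAL tail is tolerated: recovery keeps
+// the records read before the problem and the DB opens in that state. An I/O
+// error while reading the WAL is different, since the remaining contents are
+// unknown rather than absent, so the open must fail instead. A minimal sketch
+// of the configuration exercised below:
+//   Options options = CurrentOptions();
+//   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+//   Status s = TryReopen(options);  // NOK when a read error is injected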
+TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value0"));
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  bool should_inject_error = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeReadWal",
+      [&](void* /*arg*/) { should_inject_error = true; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
+        if (should_inject_error) {
+          ASSERT_NE(nullptr, arg);
+          *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  options.avoid_flush_during_recovery = true;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  Status s = TryReopen(options);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:Start:1",
+        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+        "DBImpl::BackgroundCallFlush:Start:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1"}, Options());
+  ASSERT_OK(Put("foo", "bar"));
+
+  // Create a CF while a flush is in progress; the live log is synced, but
+  // the closed log file is not synced and gets corrupted.
+  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+  CreateColumnFamilies({"test2"}, Options());
+  env_->corrupt_in_sync_ = true;
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+  flush_thread.join();
+  env_->corrupt_in_sync_ = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopening the DB should not corrupt anything
+  Options options = CurrentOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
+TEST_F(DBTest2, RenameDirectory) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value0"));
+  Close();
+  auto old_dbname = dbname_;
+  auto new_dbname = dbname_ + "_2";
+  EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
+  options.create_if_missing = false;
+  dbname_ = new_dbname;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ("value0", Get("foo"));
+  Destroy(options);
+  dbname_ = old_dbname;
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  options.statistics = CreateDBStatistics();
+  // Skip verification for now
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  std::atomic_int skipped = 0;
+  std::atomic_int passed = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::SkippedVerifyUniqueId",
+      [&](void* /*arg*/) { skipped++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::PassedVerifyUniqueId",
+      [&](void* /*arg*/) { passed++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate a few SSTs
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Verification has been skipped on files so far
+  EXPECT_EQ(skipped, kNumSst);
+  EXPECT_EQ(passed, 0);
+
+  // Reopen with verification
+  options.verify_sst_unique_id_in_manifest = true;
+  skipped = 0;
+  passed = 0;
+  Reopen(options);
+  EXPECT_EQ(skipped, 0);
+  EXPECT_EQ(passed, kNumSst);
+
+  // Now simulate no unique id in manifest for next file
+  // NOTE: this only works for loading manifest from disk,
+  // not in-memory manifest, so we need to re-open below.
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
+        auto unique_id = static_cast<UniqueId64x2*>(arg);
+        // remove id before writing it to manifest
+        (*unique_id)[0] = 0;
+        (*unique_id)[1] = 0;
+      });
+
+  // test compaction-generated SSTs
+  for (int i = kNumSst; i < kLevel0Trigger; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  // Reopen (with verification)
+  ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+  skipped = 0;
+  passed = 0;
+  Reopen(options);
+  EXPECT_EQ(skipped, 1);
+  EXPECT_EQ(passed, 0);
+}
+
+TEST_F(DBTest2, SstUniqueIdVerify) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  // Allow mismatch for now
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        // update table property session_id to a different one, which
+        // changes the unique ID
+        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate a few SSTs
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Reopen with verification should report corruption
+  options.verify_sst_unique_id_in_manifest = true;
+  auto s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+
+  // Reopen without verification should be fine
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  // test compaction-generated SSTs
+  for (int i = kNumSst; i < kLevel0Trigger; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  // Reopen with verification should fail
+  options.verify_sst_unique_id_in_manifest = true;
+  s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  // Allow mismatch for now
+  options.verify_sst_unique_id_in_manifest = false;
+
+  CreateAndReopenWithCF({"one", "two"}, options);
+
+  // generate good SSTs
+  for (int cf_num : {0, 2}) {
+    for (int i = 0; i < kNumSst; i++) {
+      for (int j = 0; j < 100; j++) {
+        ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
+      }
+      ASSERT_OK(Flush(cf_num));
+    }
+  }
+
+  // generate SSTs with bad unique id
+  SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        // update table property session_id to a different one
+        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush(1));
+  }
+
+  // Reopen with verification should report corruption
+  options.verify_sst_unique_id_in_manifest = true;
+  auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
+  const auto tamper_with_uniq_id = [&](void* arg) {
+    auto props = static_cast<TableProperties*>(arg);
+    assert(props);
+    // update table property session_id to a different one
+    props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+  };
+
+  const auto assert_db = [&](size_t expected_count,
+                             const std::string& expected_v) {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    size_t cnt = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
+      ASSERT_EQ(std::to_string(cnt), it->key());
+      ASSERT_EQ(expected_v, it->value());
+    }
+    ASSERT_EQ(expected_count, cnt);
+  };
+
+  const int num_l0_compaction_trigger = 8;
+  const int num_l0 = num_l0_compaction_trigger - 1;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
+
+  for (int k = 0; k < num_l0; ++k) {
+    // Allow mismatch for now
+    options.verify_sst_unique_id_in_manifest = false;
+
+    DestroyAndReopen(options);
+
+    constexpr size_t num_keys_per_file = 10;
+    for (int i = 0; i < num_l0; ++i) {
+      for (size_t j = 0; j < num_keys_per_file; ++j) {
+        ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
+      }
+      if (i == k) {
+        SyncPoint::GetInstance()->DisableProcessing();
+        SyncPoint::GetInstance()->SetCallBack(
+            "PropertyBlockBuilder::AddTableProperty:Start",
+            tamper_with_uniq_id);
+        SyncPoint::GetInstance()->EnableProcessing();
+      }
+      ASSERT_OK(Flush());
+    }
+
+    options.verify_sst_unique_id_in_manifest = true;
+    Status s = TryReopen(options);
+    ASSERT_TRUE(s.IsCorruption());
+
+    options.best_efforts_recovery = true;
+    Reopen(options);
+    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+    // Reopen with regular recovery
+    options.best_efforts_recovery = false;
+    Reopen(options);
+    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    for (size_t i = 0; i < num_keys_per_file; ++i) {
+      ASSERT_OK(Put(std::to_string(i), "v"));
+    }
+    ASSERT_OK(Flush());
+    Reopen(options);
+    {
+      for (size_t i = 0; i < num_keys_per_file; ++i) {
+        ASSERT_EQ("v", Get(std::to_string(i)));
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
+  Destroy(last_options_);
+
+  Options options = CurrentOptions();
+  options.max_write_buffer_size_to_maintain = 64 << 10;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  constexpr uint64_t kTsU64Value = 12;
+
+  for (uint64_t key = 0; key < 100; ++key) {
+    std::string ts;
+    PutFixed64(&ts, kTsU64Value);
+
+    std::string key_str;
+    PutFixed64(&key_str, key);
+    std::reverse(key_str.begin(), key_str.end());
+    ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value"));
+  }
+
+  ASSERT_OK(Flush());
+
+  constexpr bool cache_only = true;
+  constexpr SequenceNumber lower_bound_seq = 0;
+  auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(
+      dbfull()->DefaultColumnFamily());
+  assert(cfhi);
+  assert(cfhi->cfd());
+  SuperVersion* sv = cfhi->cfd()->GetSuperVersion();
+  for (uint64_t key = 0; key < 100; ++key) {
+    std::string key_str;
+    PutFixed64(&key_str, key);
+    std::reverse(key_str.begin(), key_str.end());
+    std::string ts;
+    SequenceNumber seq = kMaxSequenceNumber;
+    bool found_record_for_key = false;
+    bool is_blob_index = false;
+
+    const Status s = dbfull()->GetLatestSequenceForKey(
+        sv, key_str, cache_only, lower_bound_seq, &seq, &ts,
+        &found_record_for_key, &is_blob_index);
+    ASSERT_OK(s);
+    std::string expected_ts;
+    PutFixed64(&expected_ts, kTsU64Value);
+    ASSERT_EQ(expected_ts, ts);
+    ASSERT_TRUE(found_record_for_key);
+    ASSERT_FALSE(is_blob_index);
+  }
+
+  // Verify that there were no reads to SST files.
+  ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..d53bca51a
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1773 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
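+//
+// Shared harness for the db_test* suites: implements SpecialEnv (an Env
+// wrapper with fault-injection and counting hooks) and DBTestBase (per-test
+// DB lifecycle plus option-configuration cycling).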
+
+#include "db/db_test_util.h"
+
+#include "cache/cache_reservation_manager.h"
+#include "db/forward_iterator.h"
+#include "env/mock_env.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/unique_id.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/format.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+int64_t MaybeCurrentTime(Env* env) {
+  int64_t time = 1337346000;  // arbitrary fallback default
+  env->GetCurrentTime(&time).PermitUncheckedError();
+  return time;
+}
+}  // anonymous namespace
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
+    : EnvWrapper(base),
+      maybe_starting_time_(MaybeCurrentTime(base)),
+      rnd_(301),
+      sleep_counter_(this),
+      time_elapse_only_sleep_(time_elapse_only_sleep),
+      no_slowdown_(time_elapse_only_sleep) {
+  delay_sstable_sync_.store(false, std::memory_order_release);
+  drop_writes_.store(false, std::memory_order_release);
+  no_space_.store(false, std::memory_order_release);
+  non_writable_.store(false, std::memory_order_release);
+  count_random_reads_ = false;
+  count_sequential_reads_ = false;
+  manifest_sync_error_.store(false, std::memory_order_release);
+  manifest_write_error_.store(false, std::memory_order_release);
+  log_write_error_.store(false, std::memory_order_release);
+  no_file_overwrite_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
+  delete_count_.store(0, std::memory_order_relaxed);
+  num_open_wal_file_.store(0);
+  log_write_slowdown_ = 0;
+  bytes_written_ = 0;
+  sync_counter_ = 0;
+  non_writeable_rate_ = 0;
+  new_writable_count_ = 0;
+  non_writable_count_ = 0;
+  table_write_callback_ = nullptr;
+}
+
+DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
+    : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+  Env* base_env = Env::Default();
+  ConfigOptions config_options;
+  EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_));
+  EXPECT_NE(nullptr, base_env);
+  if (getenv("MEM_ENV")) {
+    mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
+  }
+#ifndef ROCKSDB_LITE
+  if (getenv("ENCRYPTED_ENV")) {
+    std::shared_ptr<EncryptionProvider> provider;
+    std::string provider_id = getenv("ENCRYPTED_ENV");
+    if (provider_id.find("=") == std::string::npos &&
+        !EndsWith(provider_id, "://test")) {
+      provider_id = provider_id + "://test";
+    }
+    EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(),
+                                                   provider_id, &provider));
+    encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider);
+  }
+#endif  // !ROCKSDB_LITE
+  env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+                                       : (mem_env_ ? mem_env_ : base_env));
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->skip_fsync_ = !env_do_fsync;
+  dbname_ = test::PerThreadDBPath(env_, path);
+  alternative_wal_dir_ = dbname_ + "/wal";
+  alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+  auto options = CurrentOptions();
+  options.env = env_;
+  auto delete_options = options;
+  delete_options.wal_dir = alternative_wal_dir_;
+  EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + Reopen(options); + Random::GetTLSInstance()->Reset(0xdeadbeef); +} + +DBTestBase::~DBTestBase() { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + options.env = env_; + + if (getenv("KEEP_DB")) { + printf("DB is still at %s\n", dbname_.c_str()); + } else { + EXPECT_OK(DestroyDB(dbname_, options)); + } + delete env_; +} + +bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { +#ifdef ROCKSDB_LITE + // These options are not supported in ROCKSDB_LITE + if (option_config == kHashSkipList || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap || + option_config == kPlainTableAllBytesPrefix || + option_config == kVectorRep || option_config == kHashLinkList || + option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions || + option_config == kFIFOCompaction || + option_config == kConcurrentSkipList) { + return true; + } +#endif + + if ((skip_mask & kSkipUniversalCompaction) && + (option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions)) { + return true; + } + if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { + return true; + } + if ((skip_mask & kSkipNoSeekToLast) && + (option_config == kHashLinkList || option_config == kHashSkipList)) { + return true; + } + if ((skip_mask & kSkipPlainTable) && + (option_config == kPlainTableAllBytesPrefix || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap)) { + return true; + } + if ((skip_mask & kSkipHashIndex) && + (option_config == kBlockBasedTableWithPrefixHashIndex || + option_config == kBlockBasedTableWithWholeKeyHashIndex)) { + return true; + } + if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { + return true; + } + if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) { + return true; + } + return false; +} + +// Switch to a fresh database with the next option configuration to +// test. Return false if there are no more configurations to test. +bool DBTestBase::ChangeOptions(int skip_mask) { + for (option_config_++; option_config_ < kEnd; option_config_++) { + if (ShouldSkipOptions(option_config_, skip_mask)) { + continue; + } + break; + } + + if (option_config_ >= kEnd) { + Destroy(last_options_); + return false; + } else { + auto options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + return true; + } +} + +// Switch between different compaction styles. 
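+// Like ChangeOptions(), these helpers are meant to drive a do/while loop in
+// a test body; a minimal usage sketch (the body re-runs once per config):
+//   do {
+//     ... test logic ...
+//   } while (ChangeCompactOptions());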
+bool DBTestBase::ChangeCompactOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    Reopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kDBLogDir;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kDBLogDir) {
+    option_config_ = kWalDirAndMmapReads;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kWalDirAndMmapReads) {
+    option_config_ = kRecycleLogFiles;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    Reopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different filter policies
+// Jump from kDefault to kFilter to kFullFilter
+bool DBTestBase::ChangeFilterOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kFilter;
+  } else if (option_config_ == kFilter) {
+    option_config_ = kFullFilterWithNewTableReaderForCompactions;
+  } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+    option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+  } else {
+    return false;
+  }
+  Destroy(last_options_);
+
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  TryReopen(options);
+  return true;
+}
+
+// Switch between different DB options for file ingestion tests.
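+// Steps through kUniversalCompaction -> kUniversalCompactionMultiLevel ->
+// kLevelSubcompactions -> kUniversalSubcompactions -> kDirectIO, reopening a
+// fresh DB for each configuration; returns false once they are exhausted.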
+bool DBTestBase::ChangeOptionsForFileIngestionTest() { + if (option_config_ == kDefault) { + option_config_ = kUniversalCompaction; + Destroy(last_options_); + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); + return true; + } else if (option_config_ == kUniversalCompaction) { + option_config_ = kUniversalCompactionMultiLevel; + Destroy(last_options_); + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); + return true; + } else if (option_config_ == kUniversalCompactionMultiLevel) { + option_config_ = kLevelSubcompactions; + Destroy(last_options_); + auto options = CurrentOptions(); + assert(options.max_subcompactions > 1); + TryReopen(options); + return true; + } else if (option_config_ == kLevelSubcompactions) { + option_config_ = kUniversalSubcompactions; + Destroy(last_options_); + auto options = CurrentOptions(); + assert(options.max_subcompactions > 1); + TryReopen(options); + return true; + } else if (option_config_ == kUniversalSubcompactions) { + option_config_ = kDirectIO; + Destroy(last_options_); + auto options = CurrentOptions(); + TryReopen(options); + return true; + } else { + return false; + } +} + +// Return the current option configuration. +Options DBTestBase::CurrentOptions( + const anon::OptionsOverride& options_override) const { + return GetOptions(option_config_, GetDefaultOptions(), options_override); +} + +Options DBTestBase::CurrentOptions( + const Options& default_options, + const anon::OptionsOverride& options_override) const { + return GetOptions(option_config_, default_options, options_override); +} + +Options DBTestBase::GetDefaultOptions() const { + Options options; + options.write_buffer_size = 4090 * 4096; + options.target_file_size_base = 2 * 1024 * 1024; + options.max_bytes_for_level_base = 10 * 1024 * 1024; + options.max_open_files = 5000; + options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + options.compaction_pri = CompactionPri::kByCompensatedSize; + options.env = env_; + if (!env_->skip_fsync_) { + options.track_and_verify_wals_in_manifest = true; + } + return options; +} + +Options DBTestBase::GetOptions( + int option_config, const Options& default_options, + const anon::OptionsOverride& options_override) const { + // this redundant copy is to minimize code change w/o having lint error. + Options options = default_options; + BlockBasedTableOptions table_options; + bool set_block_based_table_factory = true; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "NewRandomAccessFile:O_DIRECT"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "NewWritableFile:O_DIRECT"); +#endif + // kMustFreeHeapAllocations -> indicates ASAN build + if (kMustFreeHeapAllocations && !options_override.full_block_cache) { + // Detecting block cache use-after-free is normally difficult in unit + // tests, because as a cache, it tends to keep unreferenced entries in + // memory, and we normally want unit tests to take advantage of block + // cache for speed. However, we also want a strong chance of detecting + // block cache use-after-free in unit tests in ASAN builds, so for ASAN + // builds we use a trivially small block cache to which entries can be + // added but are immediately freed on no more references. 
+ table_options.block_cache = NewLRUCache(/* too small */ 1); + } + + bool can_allow_mmap = IsMemoryMappedAccessSupported(); + switch (option_config) { +#ifndef ROCKSDB_LITE + case kHashSkipList: + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + case kPlainTableFirstBytePrefix: + options.table_factory.reset(NewPlainTableFactory()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.allow_mmap_reads = can_allow_mmap; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; + case kPlainTableCappedPrefix: + options.table_factory.reset(NewPlainTableFactory()); + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + options.allow_mmap_reads = can_allow_mmap; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; + case kPlainTableCappedPrefixNonMmap: + options.table_factory.reset(NewPlainTableFactory()); + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + options.allow_mmap_reads = false; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; + case kPlainTableAllBytesPrefix: + options.table_factory.reset(NewPlainTableFactory()); + options.prefix_extractor.reset(NewNoopTransform()); + options.allow_mmap_reads = can_allow_mmap; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; + case kVectorRep: + options.memtable_factory.reset(new VectorRepFactory(100)); + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + case kHashLinkList: + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.memtable_factory.reset( + NewHashLinkListRepFactory(4, 0, 3, true, 4)); + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + case kDirectIO: { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + options.compaction_readahead_size = 2 * 1024 * 1024; + SetupSyncPointsToMockDirectIO(); + break; + } +#endif // ROCKSDB_LITE + case kMergePut: + options.merge_operator = MergeOperators::CreatePutOperator(); + break; + case kFilter: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case kFullFilterWithNewTableReaderForCompactions: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.compaction_readahead_size = 10 * 1024 * 1024; + break; + case kPartitionedFilterWithNewTableReaderForCompactions: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + table_options.partition_filters = true; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.compaction_readahead_size = 10 * 1024 * 1024; + break; + case kUncompressed: + options.compression = kNoCompression; + break; + case kNumLevel_3: + options.num_levels = 3; + break; + case kDBLogDir: + options.db_log_dir = alternative_db_log_dir_; + break; + case kWalDirAndMmapReads: + options.wal_dir = alternative_wal_dir_; + // mmap reads should be orthogonal to WalDir setting, so we piggyback to + // this option config to test mmap reads as well + options.allow_mmap_reads = can_allow_mmap; + break; + case kManifestFileSize: + options.max_manifest_file_size = 50; // 50 bytes + break; + case kPerfOptions: + 
options.delayed_write_rate = 8 * 1024 * 1024; + options.report_bg_io_stats = true; + // TODO(3.13) -- test more options + break; + case kUniversalCompaction: + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + break; + case kUniversalCompactionMultiLevel: + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 8; + break; + case kCompressedBlockCache: + options.allow_mmap_writes = can_allow_mmap; + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + break; + case kInfiniteMaxOpenFiles: + options.max_open_files = -1; + break; + case kCRC32cChecksum: { + // Old default was CRC32c, but XXH3 (new default) is faster on common + // hardware + table_options.checksum = kCRC32c; + // Thrown in here for basic coverage: + options.DisableExtraChecks(); + break; + } + case kFIFOCompaction: { + options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; + break; + } + case kBlockBasedTableWithPrefixHashIndex: { + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + break; + } + case kBlockBasedTableWithWholeKeyHashIndex: { + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.prefix_extractor.reset(NewNoopTransform()); + break; + } + case kBlockBasedTableWithPartitionedIndex: { + table_options.format_version = 3; + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + options.prefix_extractor.reset(NewNoopTransform()); + break; + } + case kBlockBasedTableWithPartitionedIndexFormat4: { + table_options.format_version = 4; + // Format 4 changes the binary index format. Since partitioned index is a + // super-set of simple indexes, we are also using kTwoLevelIndexSearch to + // test this format. + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + // The top-level index in partition filters are also affected by format 4. 
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      table_options.partition_filters = true;
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kBlockBasedTableWithIndexRestartInterval: {
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kBlockBasedTableWithLatestFormat: {
+      // In case different from default
+      table_options.format_version = kLatestFormatVersion;
+      break;
+    }
+    case kOptimizeFiltersForHits: {
+      options.optimize_filters_for_hits = true;
+      set_block_based_table_factory = true;
+      break;
+    }
+    case kRowCache: {
+      options.row_cache = NewLRUCache(1024 * 1024);
+      break;
+    }
+    case kRecycleLogFiles: {
+      options.recycle_log_file_num = 2;
+      break;
+    }
+    case kLevelSubcompactions: {
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kUniversalSubcompactions: {
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kConcurrentSkipList: {
+      options.allow_concurrent_memtable_write = true;
+      options.enable_write_thread_adaptive_yield = true;
+      break;
+    }
+    case kPipelinedWrite: {
+      options.enable_pipelined_write = true;
+      break;
+    }
+    case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path
+      options.two_write_queues = true;
+      options.manual_wal_flush = true;
+      break;
+    }
+    case kUnorderedWrite: {
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    }
+
+    default:
+      break;
+  }
+
+  if (options_override.filter_policy) {
+    table_options.filter_policy = options_override.filter_policy;
+    table_options.partition_filters = options_override.partition_filters;
+    table_options.metadata_block_size = options_override.metadata_block_size;
+  }
+  if (set_block_based_table_factory) {
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  options.env = env_;
+  options.create_if_missing = true;
+  options.fail_if_options_file_error = true;
+  return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+                                      const Options& options) {
+  ColumnFamilyOptions cf_opts(options);
+  size_t cfi = handles_.size();
+  handles_.resize(cfi + cfs.size());
+  for (auto cf : cfs) {
+    Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+    ASSERT_OK(s);
+  }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                                       const Options& options) {
+  CreateColumnFamilies(cfs, options);
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+  ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const std::vector<Options>& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const Options& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) {
+  time_elapse_only_sleep_on_reopen_ = true;
+
+  // Need to disable stats dumping and persisting which also use
+  // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+  // With time_elapse_only_sleep_, this can hang on some platforms (MacOS)
+  // because (a) on some platforms, pthread_cond_timedwait does not appear
+  // to release the lock for other threads to operate if the deadline time
+  // is already passed, and (b) TimedWait calls are currently a bad abstraction
+  // because the deadline parameter is usually computed from Env time,
+  // but is interpreted in real clock time.
+  options->stats_dump_period_sec = 0;
+  options->stats_persist_period_sec = 0;
+}
+
+void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) {
+  if (time_elapse_only_sleep_on_reopen_) {
+    assert(options.env == env_ ||
+           static_cast_with_check<CompositeEnvWrapper>(options.env)
+                   ->env_target() == env_);
+    assert(options.stats_dump_period_sec == 0);
+    assert(options.stats_persist_period_sec == 0);
+    // We cannot set these before destroying the last DB because they might
+    // cause a deadlock or similar without the appropriate options set in
+    // the DB.
+    env_->time_elapse_only_sleep_ = true;
+    env_->no_slowdown_ = true;
+  } else {
+    // Going back in same test run is not yet supported, so no
+    // reset in this case.
+  }
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+  Close();
+  EXPECT_EQ(cfs.size(), options.size());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+  }
+  DBOptions db_opts = DBOptions(options[0]);
+  last_options_ = options[0];
+  MaybeInstallTimeElapseOnlySleep(db_opts);
+  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const Options& options) {
+  Close();
+  std::vector<Options> v_opts(cfs.size(), options);
+  return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+  for (auto h : handles_) {
+    EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+  }
+  handles_.clear();
+  delete db_;
+  db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+  // Destroy using last options
+  Destroy(last_options_);
+  Reopen(options);
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  if (delete_cf_paths) {
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      ColumnFamilyDescriptor cfdescriptor;
+      // GetDescriptor is not implemented for ROCKSDB_LITE
+      handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+      column_families.push_back(cfdescriptor);
+    }
+  }
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+  MaybeInstallTimeElapseOnlySleep(options);
+  return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+  Close();
+  last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here, since it destructs the
+  // std::shared_ptr members in the order of their creation, in contrast to
+  // destructors, which destruct them in the opposite order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually call the destructor of table_factory, which
+  // eventually clears the block cache.
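+  // Editorial sketch of the hazard described above (names illustrative, not
+  // part of the upstream code):
+  //
+  //   Options a;     // a.table_factory owns a block cache
+  //   a = other;     // operator= releases a's old shared_ptrs in declaration
+  //                  // order, so the block cache may run eviction callbacks
+  //                  // against an a.statistics that was already destroyed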
+  last_options_ = options;
+  MaybeInstallTimeElapseOnlySleep(options);
+  return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+  return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+  return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+  if (cf == 0) {
+    return db_->Flush(FlushOptions());
+  } else {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+  std::vector<ColumnFamilyHandle*> cfhs;
+  std::for_each(cf_ids.begin(), cf_ids.end(),
+                [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+  return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, k, v);
+  } else {
+    return db_->Put(wo, k, v);
+  }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+                       WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, handles_[cf], k, v);
+  } else {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+  return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+                         WriteOptions wo) {
+  return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+  return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+  return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+                            const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, handles_[cf], k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+                                              const std::vector<std::string>& k,
+                                              const Snapshot* snapshot,
+                                              const bool batched,
+                                              const bool async) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  options.async_io = async;
+  std::vector<ColumnFamilyHandle*> handles;
+  std::vector<Slice> keys;
+  std::vector<std::string> result;
+
+  for (unsigned int i = 0; i < cfs.size(); ++i) {
+    handles.push_back(handles_[cfs[i]]);
+    keys.push_back(k[i]);
+  }
+  std::vector<Status> s;
+  if (!batched) {
+    s = db_->MultiGet(options, handles, keys, &result);
+    for (size_t i = 0; i < s.size(); ++i) {
+      if (s[i].IsNotFound()) {
+        result[i] = "NOT_FOUND";
+      } else if (!s[i].ok()) {
+        result[i] = s[i].ToString();
+      }
+    }
+  } else {
+    std::vector<PinnableSlice> pin_values(cfs.size());
+    result.resize(cfs.size());
+    s.resize(cfs.size());
+    db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+                  pin_values.data(), s.data());
+    for (size_t i = 0; i < s.size(); ++i) {
+      if (s[i].IsNotFound()) {
+        result[i] = "NOT_FOUND";
"NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } else { + result[i].assign(pin_values[i].data(), pin_values[i].size()); + // Increase likelihood of detecting potential use-after-free bugs with + // PinnableSlices tracking the same resource + pin_values[i].Reset(); + } + } + } + return result; +} + +std::vector DBTestBase::MultiGet(const std::vector& k, + const Snapshot* snapshot, + const bool async) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + options.async_io = async; + std::vector keys; + std::vector result(k.size()); + std::vector statuses(k.size()); + std::vector pin_values(k.size()); + + for (size_t i = 0; i < k.size(); ++i) { + keys.push_back(k[i]); + } + db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), pin_values.data(), statuses.data()); + for (size_t i = 0; i < statuses.size(); ++i) { + if (statuses[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!statuses[i].ok()) { + result[i] = statuses[i].ToString(); + } else { + result[i].assign(pin_values[i].data(), pin_values[i].size()); + // Increase likelihood of detecting potential use-after-free bugs with + // PinnableSlices tracking the same resource + pin_values[i].Reset(); + } + } + return result; +} + +Status DBTestBase::Get(const std::string& k, PinnableSlice* v) { + ReadOptions options; + options.verify_checksums = true; + Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v); + return s; +} + +uint64_t DBTestBase::GetNumSnapshots() { + uint64_t int_num; + EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num)); + return int_num; +} + +uint64_t DBTestBase::GetTimeOldestSnapshots() { + uint64_t int_num; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num)); + return int_num; +} + +uint64_t DBTestBase::GetSequenceOldestSnapshots() { + uint64_t int_num; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num)); + return int_num; +} + +// Return a string that contains all key,value pairs in order, +// formatted like "(k1->v1)(k2->v2)". +std::string DBTestBase::Contents(int cf) { + std::vector forward; + std::string result; + Iterator* iter = (cf == 0) ? 
+  Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+                             : db_->NewIterator(ReadOptions(), handles_[cf]);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string s = IterStatus(iter);
+    result.push_back('(');
+    result.append(s);
+    result.push_back(')');
+    forward.push_back(s);
+  }
+
+  // Check reverse iteration results are the reverse of forward results
+  unsigned int matched = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    EXPECT_LT(matched, forward.size());
+    EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+    matched++;
+  }
+  EXPECT_EQ(matched, forward.size());
+
+  delete iter;
+  return result;
+}
+
+void DBTestBase::CheckAllEntriesWithFifoReopen(
+    const std::string& expected_value, const Slice& user_key, int cf,
+    const std::vector<std::string>& cfs, const Options& options) {
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+
+  Options fifo_options(options);
+  fifo_options.compaction_style = kCompactionStyleFIFO;
+  fifo_options.max_open_files = -1;
+  fifo_options.disable_auto_compactions = true;
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, fifo_options));
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, options));
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+}
+
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+  Arena arena;
+  auto options = CurrentOptions();
+  InternalKeyComparator icmp(options.comparator);
+  ReadOptions read_options;
+  ScopedArenaIterator iter;
+  if (cf == 0) {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber));
+  } else {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber, handles_[cf]));
+  }
+  InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+  iter->Seek(target.Encode());
+  std::string result;
+  if (!iter->status().ok()) {
+    result = iter->status().ToString();
+  } else {
+    result = "[ ";
+    bool first = true;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) !=
+          Status::OK()) {
+        result += "CORRUPTED";
+      } else {
+        if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+          break;
+        }
+        if (!first) {
+          result += ", ";
+        }
+        first = false;
+        switch (ikey.type) {
+          case kTypeValue:
+            result += iter->value().ToString();
+            break;
+          case kTypeMerge:
+            // keep it the same as kTypeValue for testing kMergePut
+            result += iter->value().ToString();
+            break;
+          case kTypeDeletion:
+            result += "DEL";
+            break;
+          case kTypeSingleDeletion:
+            result += "SDEL";
+            break;
+          default:
+            assert(false);
+            break;
+        }
+      }
+      iter->Next();
+    }
+    if (!first) {
+      result += " ";
+    }
+    result += "]";
+  }
+  return result;
+}
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+  for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+    if (cf_meta.levels[i].files.size() > 0) {
+      num_sr++;
+    }
+  }
+  return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  uint64_t sum = 0;
+  for (const auto& m : metadata) {
+    if (m.level == level) {
+      sum += m.size;
+    }
+  }
+  return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  size_t num_files = 0;
+  for (auto& level : cf_meta.levels) {
+    num_files += level.files.size();
+  }
+  return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+  std::string property;
+  if (cf == 0) {
+    // default cfd
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+  } else {
+    EXPECT_TRUE(db_->GetProperty(
+        handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+        &property));
+  }
+  return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+  std::string property;
+  if (cf == 0) {
+    // default cfd
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.compression-ratio-at-level" + std::to_string(level),
+        &property));
+  } else {
+    EXPECT_TRUE(db_->GetProperty(
+        handles_[cf],
+        "rocksdb.compression-ratio-at-level" + std::to_string(level),
+        &property));
+  }
+  return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+  if (levels == -1) {
+    levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+  }
+  int result = 0;
+  for (int level = 0; level < levels; level++) {
+    result += NumTableFilesAtLevel(level, cf);
+  }
+  return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+  int num_levels =
+      (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+  std::string result;
+  size_t last_non_zero_offset = 0;
+  for (int level = 0; level < num_levels; level++) {
+    int f = NumTableFilesAtLevel(level, cf);
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; +} + +#endif // !ROCKSDB_LITE + +std::vector DBTestBase::GetBlobFileNumbers() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + std::vector result; + result.reserve(blob_files.size()); + + for (const auto& blob_file : blob_files) { + assert(blob_file); + result.emplace_back(blob_file->GetBlobFileNumber()); + } + + return result; +} + +size_t DBTestBase::CountFiles() { + size_t count = 0; + std::vector files; + if (env_->GetChildren(dbname_, &files).ok()) { + count += files.size(); + } + + if (dbname_ != last_options_.wal_dir) { + if (env_->GetChildren(last_options_.wal_dir, &files).ok()) { + count += files.size(); + } + } + + return count; +}; + +Status DBTestBase::CountFiles(size_t* count) { + std::vector files; + Status s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + return s; + } + size_t files_count = files.size(); + + if (dbname_ != last_options_.wal_dir) { + s = env_->GetChildren(last_options_.wal_dir, &files); + if (!s.ok()) { + return s; + } + *count = files_count + files.size(); + } + + return Status::OK(); +} + +Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, + uint64_t* size) { + Range r(start, limit); + if (cf == 0) { + return db_->GetApproximateSizes(&r, 1, size); + } else { + return db_->GetApproximateSizes(handles_[1], &r, 1, size); + } +} + +void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); +} + +void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); +} + +void DBTestBase::Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); +} + +// Do n memtable compactions, each of which produces an sstable +// covering the range [small,large]. +void DBTestBase::MakeTables(int n, const std::string& small, + const std::string& large, int cf) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); + MoveFilesToLevel(n - i - 1, cf); + } +} + +// Prevent pushing of new sstables into deeper levels by adding +// tables that cover a specified range to all levels. 
+void DBTestBase::FillLevels(const std::string& smallest,
+                            const std::string& largest, int cf) {
+  MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+  for (int l = 0; l < level; ++l) {
+    if (cf > 0) {
+      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
+    } else {
+      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+  fprintf(stderr, "---\n%s:\n", label);
+  fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+          dbfull()->TEST_MaxNextLevelOverlappingBytes());
+  for (int level = 0; level < db_->NumberLevels(); level++) {
+    int num = NumTableFilesAtLevel(level);
+    if (num > 0) {
+      fprintf(stderr, "  level %3d : %d files\n", level, num);
+    }
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+  std::string property;
+  db_->GetProperty("rocksdb.sstables", &property);
+  return property;
+}
+
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+                             std::vector<std::string>* files) {
+  EXPECT_OK(env->GetChildren(path, files));
+
+  files->erase(std::remove_if(files->begin(), files->end(),
+                              [](std::string name) {
+                                uint64_t number;
+                                FileType type;
+                                return !(ParseFileName(name, &number, &type) &&
+                                         type == kTableFile);
+                              }),
+               files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+  std::vector<std::string> files;
+  DBTestBase::GetSstFiles(env_, path, &files);
+  return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+                                 bool nowait) {
+  for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+    ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+  for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+    ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ?
1 : 990))); + (*key_idx)++; + } + if (!nowait) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } +} + +const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51; + +void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { + for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) { + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000))); + } + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200))); + if (!nowait) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } +} + +std::string DBTestBase::IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; +} + +Options DBTestBase::OptionsForLogIterTest() { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + return options; +} + +std::string DBTestBase::DummyString(size_t len, char c) { + return std::string(len, c); +} + +void DBTestBase::VerifyIterLast(std::string expected_key, int cf) { + Iterator* iter; + ReadOptions ro; + if (cf == 0) { + iter = db_->NewIterator(ro); + } else { + iter = db_->NewIterator(ro, handles_[cf]); + } + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), expected_key); + delete iter; +} + +// Used to test InplaceUpdate + +// If previous value is nullptr or delta is > than previous value, +// sets newValue with delta +// If previous value is not empty, +// updates previous value with 'b' string of previous value size - 1. +UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = *prevSize - 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } +} + +UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } +} + +UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/, + uint32_t* /*prevSize*/, + Slice delta, + std::string* newValue) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; +} + +UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/, + uint32_t* /*prevSize*/, + Slice /*delta*/, + std::string* /*newValue*/) { + return UpdateStatus::UPDATE_FAILED; +} + +// Utility method to test InplaceUpdate +void DBTestBase::validateNumberOfEntries(int numValues, int cf) { + Arena arena; + auto options = CurrentOptions(); + InternalKeyComparator icmp(options.comparator); + ReadOptions read_options; + ScopedArenaIterator iter; + if (cf != 0) { + iter.set(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber, handles_[cf])); + } else { + iter.set(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber)); + } + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.clear(); + 
ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+
+    // checks sequence number for updates
+    ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+    iter->Next();
+  }
+  ASSERT_EQ(0, seq);
+}
+
+void DBTestBase::CopyFile(const std::string& source,
+                          const std::string& destination, uint64_t size) {
+  const EnvOptions soptions;
+  std::unique_ptr<SequentialFile> srcfile;
+  ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+  std::unique_ptr<WritableFile> destfile;
+  ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+  if (size == 0) {
+    // default argument means copy everything
+    ASSERT_OK(env_->GetFileSize(source, &size));
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+    ASSERT_OK(srcfile->Read(one, &slice, buffer));
+    ASSERT_OK(destfile->Append(slice));
+    size -= slice.size();
+  }
+  ASSERT_OK(destfile->Close());
+}
+
+Status DBTestBase::GetAllDataFiles(
+    const FileType file_type, std::unordered_map<std::string, uint64_t>* files,
+    uint64_t* total_size /* = nullptr */) {
+  if (total_size) {
+    *total_size = 0;
+  }
+  std::vector<std::string> children;
+  Status s = env_->GetChildren(dbname_, &children);
+  if (s.ok()) {
+    for (auto& file_name : children) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file_name, &number, &type) && type == file_type) {
+        std::string file_path = dbname_ + "/" + file_name;
+        uint64_t file_size = 0;
+        s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          break;
+        }
+        (*files)[file_path] = file_size;
+        if (total_size) {
+          *total_size += file_size;
+        }
+      }
+    }
+  }
+  return s;
+}
+
+std::vector<uint64_t> DBTestBase::ListTableFiles(Env* env,
+                                                 const std::string& path) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  EXPECT_OK(env->GetChildren(path, &files));
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == kTableFile) {
+        file_numbers.push_back(number);
+      }
+    }
+  }
+  return file_numbers;
+}
+
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+                                 size_t* total_reads_res, bool tailing_iter,
+                                 std::map<std::string, Status> status) {
+  size_t total_reads = 0;
+
+  for (auto& kv : true_data) {
+    Status s = status[kv.first];
+    if (s.ok()) {
+      ASSERT_EQ(Get(kv.first), kv.second);
+    } else {
+      std::string value;
+      ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+    }
+    total_reads++;
+  }
+
+  // Normal Iterator
+  {
+    int iter_cnt = 0;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Iterator* iter = db_->NewIterator(ro);
+    // Verify Iterator::Next()
+    iter_cnt = 0;
+    auto data_iter = true_data.begin();
+    Status s;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      Status current_status = status[data_iter->first];
+      if (!current_status.ok()) {
+        s = current_status;
+      }
+      ASSERT_EQ(iter->status(), s);
+      if (current_status.ok()) {
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      }
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_iter, true_data.end())
+        << iter_cnt << " / " << true_data.size();
+    delete iter;
+
+    // Verify Iterator::Prev()
+    // Use a new iterator to make sure its status is clean.
+    iter = db_->NewIterator(ro);
+    iter_cnt = 0;
+    s = Status::OK();
+    auto data_rev = true_data.rbegin();
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+      ASSERT_EQ(iter->key().ToString(), data_rev->first);
+      Status current_status = status[data_rev->first];
+      if (!current_status.ok()) {
+        s = current_status;
+      }
+      ASSERT_EQ(iter->status(), s);
+      if (current_status.ok()) {
+        ASSERT_EQ(iter->value().ToString(), data_rev->second);
+      }
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_rev, true_data.rend())
+        << iter_cnt << " / " << true_data.size();
+
+    // Verify Iterator::Seek()
+    for (auto kv : true_data) {
+      iter->Seek(kv.first);
+      ASSERT_EQ(kv.first, iter->key().ToString());
+      ASSERT_EQ(kv.second, iter->value().ToString());
+      total_reads++;
+    }
+    delete iter;
+  }
+
+  if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+    // Tailing iterator
+    int iter_cnt = 0;
+    ReadOptions ro;
+    ro.tailing = true;
+    ro.total_order_seek = true;
+    Iterator* iter = db_->NewIterator(ro);
+
+    // Verify ForwardIterator::Next()
+    iter_cnt = 0;
+    auto data_iter = true_data.begin();
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_iter, true_data.end())
+        << iter_cnt << " / " << true_data.size();
+
+    // Verify ForwardIterator::Seek()
+    for (auto kv : true_data) {
+      iter->Seek(kv.first);
+      ASSERT_EQ(kv.first, iter->key().ToString());
+      ASSERT_EQ(kv.second, iter->value().ToString());
+      total_reads++;
+    }
+
+    delete iter;
+#endif  // ROCKSDB_LITE
+  }
+
+  if (total_reads_res) {
+    *total_reads_res = total_reads;
+  }
+}
+
+void DBTestBase::VerifyDBInternal(
+    std::vector<std::pair<std::string, std::string>> true_data) {
+  Arena arena;
+  InternalKeyComparator icmp(last_options_.comparator);
+  ReadOptions read_options;
+  auto iter =
+      dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber);
+  iter->SeekToFirst();
+  for (auto p : true_data) {
+    ASSERT_TRUE(iter->Valid());
+    ParsedInternalKey ikey;
+    ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+    ASSERT_EQ(p.first, ikey.user_key);
+    ASSERT_EQ(p.second, iter->value());
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+  iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+    DB* db, std::string column_family_name) {
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  uint64_t result = 0;
+  for (auto& fileMetadata : metadata) {
+    result += (fileMetadata.column_family_name == column_family_name);
+  }
+  return result;
+}
+
+uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
+  std::string prop;
+  EXPECT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature +
+          std::to_string(static_cast<int>(temperature)),
+      &prop));
+  return static_cast<uint64_t>(std::atoi(prop.c_str()));
+}
+#endif  // ROCKSDB_LITE
+
+void VerifySstUniqueIds(const TablePropertiesCollection& props) {
+  ASSERT_FALSE(props.empty());  // suspicious test if empty
+  std::unordered_set<std::string> seen;
+  for (auto& pair : props) {
+    std::string id;
+    ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
+    ASSERT_TRUE(seen.insert(id).second);
+  }
+}
+
+template <CacheEntryRole R>
+TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
+    std::shared_ptr<Cache> target)
+    : CacheWrapper(std::move(target)),
+      cur_cache_charge_(0),
+      cache_charge_peak_(0),
+      cache_charge_increment_(0),
+      last_peak_tracked_(false),
+      cache_charge_increments_sum_(0) {}
+
+template <CacheEntryRole R>
+Status TargetCacheChargeTrackingCache<R>::Insert(
+    const Slice& key, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value), Handle** handle,
+    Priority priority) {
+  Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+  if (deleter == kNoopDeleter) {
+    if (last_peak_tracked_) {
+      cache_charge_peak_ = 0;
+      cache_charge_increment_ = 0;
+      last_peak_tracked_ = false;
+    }
+    if (s.ok()) {
+      cur_cache_charge_ += charge;
+    }
+    cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
+    cache_charge_increment_ += charge;
+  }
+
+  return s;
+}
+
+template <CacheEntryRole R>
+bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
+                                                bool erase_if_last_ref) {
+  auto deleter = GetDeleter(handle);
+  if (deleter == kNoopDeleter) {
+    if (!last_peak_tracked_) {
+      cache_charge_peaks_.push_back(cache_charge_peak_);
+      cache_charge_increments_sum_ += cache_charge_increment_;
+      last_peak_tracked_ = true;
+    }
+    cur_cache_charge_ -= GetCharge(handle);
+  }
+  bool is_successful = target_->Release(handle, erase_if_last_ref);
+  return is_successful;
+}
+
+template <CacheEntryRole R>
+const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
+    CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
+
+template class TargetCacheChargeTrackingCache<
+    CacheEntryRole::kFilterConstruction>;
+template class TargetCacheChargeTrackingCache<
+    CacheEntryRole::kBlockBasedTableReader>;
+template class TargetCacheChargeTrackingCache<CacheEntryRole::kMisc>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..29d5cd9d7
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1402 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
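+//
+// Example (editorial sketch, assuming the DBTestBase constructor declared
+// later in this header takes a test path and an env_do_fsync flag): a typical
+// fixture built on these utilities:
+//
+//   class FooDBTest : public DBTestBase {
+//    public:
+//     FooDBTest() : DBTestBase("foo_db_test", /*env_do_fsync=*/true) {}
+//   };
+//
+//   TEST_F(FooDBTest, Roundtrip) {
+//     do {
+//       ASSERT_OK(Put("k", "v"));
+//       ASSERT_EQ("v", Get("k"));
+//     } while (ChangeOptions());
+//   }
+//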
+
+#pragma once
+
+#include <fcntl.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "file/filename.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MockEnv;
+
+namespace anon {
+class AtomicCounter {
+ public:
+  explicit AtomicCounter(Env* env = NULL)
+      : env_(env), cond_count_(&mu_), count_(0) {}
+
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+    cond_count_.SignalAll();
+  }
+
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+
+  bool WaitFor(int count) {
+    MutexLock l(&mu_);
+
+    uint64_t start = env_->NowMicros();
+    while (count_ < count) {
+      uint64_t now = env_->NowMicros();
+      cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+      if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+        return false;
+      }
+      if (count_ < count) {
+        GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+      }
+    }
+
+    return true;
+  }
+
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+    cond_count_.SignalAll();
+  }
+
+ private:
+  Env* env_;
+  port::Mutex mu_;
+  port::CondVar cond_count_;
+  int count_;
+};
+
+struct OptionsOverride {
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+  // These will be used only if filter_policy is set
+  bool partition_filters = false;
+  // Force using a default block cache. (Setting to false allows ASAN build
+  // use a trivially small block cache for better UAF error detection.)
+ bool full_block_cache = false; + uint64_t metadata_block_size = 1024; + + // Used as a bit mask of individual enums in which to skip an XF test point + int skip_policy = 0; +}; + +} // namespace anon + +enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; + +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "SpecialEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) override { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + std::unique_ptr base_; + + public: + SSTableFile(SpecialEnv* env, std::unique_ptr&& base) + : env_(env), base_(std::move(base)) {} + Status Append(const Slice& data) override { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } + if (env_->drop_writes_.load(std::memory_order_acquire)) { + // Drop writes on the floor + return Status::OK(); + } else if (env_->no_space_.load(std::memory_order_acquire)) { + return Status::NoSpace("No space left on device"); + } else { + env_->bytes_written_ += data.size(); + return base_->Append(data); + } + } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } + if (env_->drop_writes_.load(std::memory_order_acquire)) { + // Drop writes on the floor + return Status::OK(); + } else if (env_->no_space_.load(std::memory_order_acquire)) { + return Status::NoSpace("No space left on device"); + } else { + env_->bytes_written_ += data.size(); + return base_->PositionedAppend(data, offset); + } + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + Status s = base_->RangeSync(offset, nbytes); +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + Status Close() override { +// SyncPoint is not supported in Released Windows Mode. +#if !(defined NDEBUG) || !defined(OS_WIN) + // Check preallocation size + // preallocation size is never passed to base file. 
+ size_t preallocation_size = preallocation_block_size(); + TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus", + &preallocation_size); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + Status s = base_->Close(); +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { + env_->SleepForMicroseconds(100000); + } + Status s; + if (!env_->skip_fsync_) { + s = base_->Sync(); + } +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + void SetIOPriority(Env::IOPriority pri) override { + base_->SetIOPriority(pri); + } + Env::IOPriority GetIOPriority() override { + return base_->GetIOPriority(); + } + bool use_direct_io() const override { return base_->use_direct_io(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return base_->GetUniqueId(id, max_size); + } + }; + class ManifestFile : public WritableFile { + public: + ManifestFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) {} + Status Append(const Slice& data) override { + if (env_->manifest_write_error_.load(std::memory_order_acquire)) { + return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { return base_->Close(); } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { + return Status::IOError("simulated sync error"); + } else { + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } + } + } + uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + class WalFile : public WritableFile { + public: + WalFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) { + env_->num_open_wal_file_.fetch_add(1); + } + virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); } + Status Append(const Slice& data) override { +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1"); +#endif + Status s; + if (env_->log_write_error_.load(std::memory_order_acquire)) { + s = Status::IOError("simulated writer error"); + } else { + int slowdown = + env_->log_write_slowdown_.load(std::memory_order_acquire); + if (slowdown > 0) { + env_->SleepForMicroseconds(slowdown); + } + s = base_->Append(data); + } +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2"); +#endif + return s; + } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + void 
PrepareWrite(size_t offset, size_t len) override { + base_->PrepareWrite(offset, len); + } + void SetPreallocationBlockSize(size_t size) override { + base_->SetPreallocationBlockSize(size); + } + Status Close() override { +// SyncPoint is not supported in Released Windows Mode. +#if !(defined NDEBUG) || !defined(OS_WIN) + // Check preallocation size + size_t block_size, last_allocated_block; + base_->GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", + &block_size); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + + return base_->Close(); + } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + if (env_->corrupt_in_sync_) { + EXPECT_OK(Append(std::string(33000, ' '))); + return Status::IOError("Ingested Sync Failure"); + } + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } + } + bool IsSyncThreadSafe() const override { + return env_->is_wal_sync_thread_safe_.load(); + } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + class OtherFile : public WritableFile { + public: + OtherFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) {} + Status Append(const Slice& data) override { return base_->Append(data); } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { return base_->Close(); } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } + } + uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + + if (no_file_overwrite_.load(std::memory_order_acquire) && + target()->FileExists(f).ok()) { + return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true."); + } + + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { + uint32_t random_number; + { + MutexLock l(&rnd_mutex_); + random_number = rnd_.Uniform(100); + } + if (random_number < non_writeable_rate_.load()) { + return Status::IOError("simulated random write error"); + } + } + + new_writable_count_++; + + if (non_writable_count_.load() > 0) { + non_writable_count_--; + return Status::IOError("simulated write error"); + } + + EnvOptions optimized = soptions; + if (strstr(f.c_str(), "MANIFEST") != nullptr || + strstr(f.c_str(), "log") != nullptr) { + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + } + + Status s = target()->NewWritableFile(f, r, optimized); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != nullptr) { + r->reset(new SSTableFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { + r->reset(new ManifestFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "log") != nullptr) { + r->reset(new WalFile(this, std::move(*r))); + } else { + r->reset(new OtherFile(this, std::move(*r))); + } + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { + class CountingFile : public RandomAccessFile { + public: + 
CountingFile(std::unique_ptr&& target, + anon::AtomicCounter* counter, + std::atomic* bytes_read) + : target_(std::move(target)), + counter_(counter), + bytes_read_(bytes_read) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + counter_->Increment(); + Status s = target_->Read(offset, n, result, scratch); + *bytes_read_ += result->size(); + return s; + } + + virtual Status Prefetch(uint64_t offset, size_t n) override { + Status s = target_->Prefetch(offset, n); + *bytes_read_ += n; + return s; + } + + private: + std::unique_ptr target_; + anon::AtomicCounter* counter_; + std::atomic* bytes_read_; + }; + + class RandomFailureFile : public RandomAccessFile { + public: + RandomFailureFile(std::unique_ptr&& target, + std::atomic* failure_cnt, uint32_t fail_odd) + : target_(std::move(target)), + fail_cnt_(failure_cnt), + fail_odd_(fail_odd) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + if (Random::GetTLSInstance()->OneIn(fail_odd_)) { + fail_cnt_->fetch_add(1); + return Status::IOError("random error"); + } + return target_->Read(offset, n, result, scratch); + } + + virtual Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + + private: + std::unique_ptr target_; + std::atomic* fail_cnt_; + uint32_t fail_odd_; + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + random_file_open_counter_++; + if (s.ok()) { + if (count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_, + &random_read_bytes_counter_)); + } else if (rand_reads_fail_odd_ > 0) { + r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_, + rand_reads_fail_odd_)); + } + } + + if (s.ok() && soptions.compaction_readahead_size > 0) { + compaction_readahead_size_ = soptions.compaction_readahead_size; + } + return s; + } + + virtual Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { + class CountingFile : public SequentialFile { + public: + CountingFile(std::unique_ptr&& target, + anon::AtomicCounter* counter) + : target_(std::move(target)), counter_(counter) {} + virtual Status Read(size_t n, Slice* result, char* scratch) override { + counter_->Increment(); + return target_->Read(n, result, scratch); + } + virtual Status Skip(uint64_t n) override { return target_->Skip(n); } + + private: + std::unique_ptr target_; + anon::AtomicCounter* counter_; + }; + + Status s = target()->NewSequentialFile(f, r, soptions); + if (s.ok() && count_sequential_reads_) { + r->reset(new CountingFile(std::move(*r), &sequential_read_counter_)); + } + return s; + } + + virtual void SleepForMicroseconds(int micros) override { + sleep_counter_.Increment(); + if (no_slowdown_ || time_elapse_only_sleep_) { + addon_microseconds_.fetch_add(micros); + } + if (!no_slowdown_) { + target()->SleepForMicroseconds(micros); + } + } + + void MockSleepForMicroseconds(int64_t micros) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s; + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { + s = target()->GetCurrentTime(unix_time); + } + if (s.ok()) { + // mock microseconds elapsed to seconds of time + 
*unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t NowCPUNanos() override {
+    now_cpu_count_.fetch_add(1);
+    return target()->NowCPUNanos();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    delete_count_.fetch_add(1);
+    return target()->DeleteFile(fname);
+  }
+
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override {
+    if (!skip_fsync_) {
+      return target()->NewDirectory(name, result);
+    } else {
+      class NoopDirectory : public Directory {
+       public:
+        NoopDirectory() {}
+        ~NoopDirectory() {}
+
+        Status Fsync() override { return Status::OK(); }
+        Status Close() override { return Status::OK(); }
+      };
+
+      result->reset(new NoopDirectory());
+      return Status::OK();
+    }
+  }
+
+  Status RenameFile(const std::string& src, const std::string& dest) override {
+    rename_count_.fetch_add(1);
+    if (rename_error_.load(std::memory_order_acquire)) {
+      return Status::NotSupported("Simulated `RenameFile()` error.");
+    }
+    return target()->RenameFile(src, dest);
+  }
+
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this is true.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this is true.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this is true.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this is true.
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this is true.
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force writes to manifest files to fail while this is true.
+  std::atomic<bool> manifest_write_error_;
+
+  // Force writes to log files to fail while this is true.
+  std::atomic<bool> log_write_error_;
+
+  // Force `RenameFile()` to fail while this is true.
+  std::atomic<bool> rename_error_{false};
+
+  // Slow down every log write, in micro-seconds.
+  std::atomic<int> log_write_slowdown_;
+
+  // If true, returns Status::NotSupported for file overwrite.
+  std::atomic<bool> no_file_overwrite_;
+
+  // Number of WAL files that are still open for write.
+  std::atomic<int> num_open_wal_file_;
+
+  bool count_random_reads_;
+  uint32_t rand_reads_fail_odd_ = 0;
+  std::atomic<uint64_t> num_reads_fails_;
+  anon::AtomicCounter random_read_counter_;
+  std::atomic<size_t> random_read_bytes_counter_;
+  std::atomic<int> random_file_open_counter_;
+
+  bool count_sequential_reads_;
+  anon::AtomicCounter sequential_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  std::atomic<int64_t> bytes_written_;
+
+  std::atomic<int> sync_counter_;
+
+  // If true, all fsync to files and directories are skipped.
+  bool skip_fsync_ = false;
+
+  // If true, ingest the corruption to file during sync.
+ bool corrupt_in_sync_ = false; + + std::atomic non_writeable_rate_; + + std::atomic new_writable_count_; + + std::atomic non_writable_count_; + + std::function* table_write_callback_; + + std::atomic now_cpu_count_; + + std::atomic delete_count_; + + std::atomic rename_count_{0}; + + std::atomic is_wal_sync_thread_safe_{true}; + + std::atomic compaction_readahead_size_{}; + + private: // accessing these directly is prone to error + friend class DBTestBase; + + std::atomic addon_microseconds_{0}; + + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic time_elapse_only_sleep_; + + bool no_slowdown_; +}; + +#ifndef ROCKSDB_LITE +class FileTemperatureTestFS : public FileSystemWrapper { + public: + explicit FileTemperatureTestFS(const std::shared_ptr& fs) + : FileSystemWrapper(fs) {} + + static const char* kClassName() { return "FileTemperatureTestFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewSequentialFile(fname, opts, result, dbg); + uint64_t number; + FileType type; + if (ParseFileName(GetFileName(fname), &number, &type) && + type == kTableFile) { + MutexLock lock(&mu_); + requested_sst_file_temperatures_.emplace_back(number, opts.temperature); + if (s.ok()) { + if (opts.temperature != Temperature::kUnknown) { + // Be extra picky and don't open if a wrong non-unknown temperature is + // provided + auto e = current_sst_file_temperatures_.find(number); + if (e != current_sst_file_temperatures_.end() && + e->second != opts.temperature) { + result->reset(); + return IOStatus::PathNotFound("Temperature mismatch on " + fname); + } + } + *result = WrapWithTemperature( + number, std::move(*result)); + } + } + return s; + } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewRandomAccessFile(fname, opts, result, dbg); + uint64_t number; + FileType type; + if (ParseFileName(GetFileName(fname), &number, &type) && + type == kTableFile) { + MutexLock lock(&mu_); + requested_sst_file_temperatures_.emplace_back(number, opts.temperature); + if (s.ok()) { + if (opts.temperature != Temperature::kUnknown) { + // Be extra picky and don't open if a wrong non-unknown temperature is + // provided + auto e = current_sst_file_temperatures_.find(number); + if (e != current_sst_file_temperatures_.end() && + e->second != opts.temperature) { + result->reset(); + return IOStatus::PathNotFound("Temperature mismatch on " + fname); + } + } + *result = WrapWithTemperature( + number, std::move(*result)); + } + } + return s; + } + + void PopRequestedSstFileTemperatures( + std::vector>* out = nullptr) { + MutexLock lock(&mu_); + if (out) { + *out = std::move(requested_sst_file_temperatures_); + assert(requested_sst_file_temperatures_.empty()); + } else { + requested_sst_file_temperatures_.clear(); + } + } + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + uint64_t number; + FileType type; + if (ParseFileName(GetFileName(fname), &number, &type) && + type == kTableFile) { + MutexLock lock(&mu_); + current_sst_file_temperatures_[number] = opts.temperature; + } + return target()->NewWritableFile(fname, opts, result, dbg); + } + + void CopyCurrentSstFileTemperatures(std::map* out) { + MutexLock 
lock(&mu_); + *out = current_sst_file_temperatures_; + } + + void OverrideSstFileTemperature(uint64_t number, Temperature temp) { + MutexLock lock(&mu_); + current_sst_file_temperatures_[number] = temp; + } + + protected: + port::Mutex mu_; + std::vector> + requested_sst_file_temperatures_; + std::map current_sst_file_temperatures_; + + std::string GetFileName(const std::string& fname) { + auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); + // workaround only for Windows that the file path could contain both Windows + // FilePathSeparator and '/' + filename = filename.substr(filename.find_last_of('/') + 1); + return filename; + } + + template + std::unique_ptr WrapWithTemperature(uint64_t number, + std::unique_ptr&& t) { + class FileWithTemp : public FileOwnerWrapperT { + public: + FileWithTemp(FileTemperatureTestFS* fs, uint64_t number, + std::unique_ptr&& t) + : FileOwnerWrapperT(std::move(t)), fs_(fs), number_(number) {} + + Temperature GetTemperature() const override { + MutexLock lock(&fs_->mu_); + return fs_->current_sst_file_temperatures_[number_]; + } + + private: + FileTemperatureTestFS* fs_; + uint64_t number_; + }; + return std::make_unique(this, number, std::move(t)); + } +}; + +class OnFileDeletionListener : public EventListener { + public: + OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "OnFileDeletionListener"; } + + void SetExpectedFileName(const std::string file_name) { + expected_file_name_ = file_name; + } + + void VerifyMatchedCount(size_t expected_value) { + ASSERT_EQ(matched_count_, expected_value); + } + + void OnTableFileDeleted(const TableFileDeletionInfo& info) override { + if (expected_file_name_ != "") { + ASSERT_EQ(expected_file_name_, info.file_path); + expected_file_name_ = ""; + matched_count_++; + } + } + + private: + size_t matched_count_; + std::string expected_file_name_; +}; + +class FlushCounterListener : public EventListener { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "FlushCounterListener"; } + std::atomic count{0}; + std::atomic expected_flush_reason{FlushReason::kOthers}; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); + } +}; +#endif + +// A test merge operator mimics put but also fails if one of merge operands is +// "corrupted". +class TestPutOperator : public MergeOperator { + public: + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + if (merge_in.existing_value != nullptr && + *(merge_in.existing_value) == "corrupted") { + return false; + } + for (auto value : merge_in.operand_list) { + if (value == "corrupted") { + return false; + } + } + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + virtual const char* Name() const override { return "TestPutOperator"; } +}; + +// A wrapper around Cache that can easily be extended with instrumentation, +// etc. 
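+//
+// As an illustrative sketch only (names below are hypothetical, not part of
+// this header): a test could count lookups by overriding just one method and
+// forwarding everything else to CacheWrapper --
+//
+//   class LookupCountingCache : public CacheWrapper {
+//    public:
+//     using CacheWrapper::CacheWrapper;
+//     Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+//       lookup_count_.fetch_add(1);
+//       return CacheWrapper::Lookup(key, stats);
+//     }
+//     std::atomic<size_t> lookup_count_{0};
+//   };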
+class CacheWrapper : public Cache { + public: + explicit CacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + const char* Name() const override { return target_->Name(); } + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { + return target_->Lookup(key, stats); + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + return target_->Release(handle, erase_if_last_ref); + } + + void* Value(Handle* handle) override { return target_->Value(handle); } + + void Erase(const Slice& key) override { target_->Erase(key); } + uint64_t NewId() override { return target_->NewId(); } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + DeleterFn GetDeleter(Handle* handle) const override { + return target_->GetDeleter(handle); + } + + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } + + protected: + std::shared_ptr target_; +}; + +/* + * A cache wrapper that tracks certain CacheEntryRole's cache charge, its + * peaks and increments + * + * p0 + * / \ p1 + * / \ /\ + * / \/ \ + * a / b \ + * peaks = {p0, p1} + * increments = {p1-a, p2-b} + */ +template +class TargetCacheChargeTrackingCache : public CacheWrapper { + public: + explicit TargetCacheChargeTrackingCache(std::shared_ptr target); + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override; + + std::size_t GetCacheCharge() { return cur_cache_charge_; } + + std::deque GetChargedCachePeaks() { return cache_charge_peaks_; } + + std::size_t GetChargedCacheIncrementSum() { + return cache_charge_increments_sum_; + } + + private: + static const Cache::DeleterFn kNoopDeleter; + + std::size_t cur_cache_charge_; + std::size_t cache_charge_peak_; + std::size_t cache_charge_increment_; + bool last_peak_tracked_; + std::deque cache_charge_peaks_; + std::size_t cache_charge_increments_sum_; +}; + +class DBTestBase : public testing::Test { + 
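+  // (Usage sketch -- illustrative only, the fixture and test names below are
+  // hypothetical:
+  //
+  //   class MyDBTest : public DBTestBase {
+  //    public:
+  //     MyDBTest() : DBTestBase("my_db_test", /*env_do_fsync=*/false) {}
+  //   };
+  //
+  //   TEST_F(MyDBTest, PutGetAcrossConfigs) {
+  //     do {
+  //       ASSERT_OK(Put("key", "value"));
+  //       ASSERT_EQ("value", Get("key"));
+  //     } while (ChangeOptions());  // repeats the body per option config
+  //   }
+  // )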
public:
+  // Sequence of option configurations to try
+  enum OptionConfig : int {
+    kDefault = 0,
+    kBlockBasedTableWithPrefixHashIndex = 1,
+    kBlockBasedTableWithWholeKeyHashIndex = 2,
+    kPlainTableFirstBytePrefix = 3,
+    kPlainTableCappedPrefix = 4,
+    kPlainTableCappedPrefixNonMmap = 5,
+    kPlainTableAllBytesPrefix = 6,
+    kVectorRep = 7,
+    kHashLinkList = 8,
+    kMergePut = 9,
+    kFilter = 10,
+    kFullFilterWithNewTableReaderForCompactions = 11,
+    kUncompressed = 12,
+    kNumLevel_3 = 13,
+    kDBLogDir = 14,
+    kWalDirAndMmapReads = 15,
+    kManifestFileSize = 16,
+    kPerfOptions = 17,
+    kHashSkipList = 18,
+    kUniversalCompaction = 19,
+    kUniversalCompactionMultiLevel = 20,
+    kCompressedBlockCache = 21,
+    kInfiniteMaxOpenFiles = 22,
+    kCRC32cChecksum = 23,
+    kFIFOCompaction = 24,
+    kOptimizeFiltersForHits = 25,
+    kRowCache = 26,
+    kRecycleLogFiles = 27,
+    kConcurrentSkipList = 28,
+    kPipelinedWrite = 29,
+    kConcurrentWALWrites = 30,
+    kDirectIO,
+    kLevelSubcompactions,
+    kBlockBasedTableWithIndexRestartInterval,
+    kBlockBasedTableWithPartitionedIndex,
+    kBlockBasedTableWithPartitionedIndexFormat4,
+    kBlockBasedTableWithLatestFormat,
+    kPartitionedFilterWithNewTableReaderForCompactions,
+    kUniversalSubcompactions,
+    kUnorderedWrite,
+    // This must be the last line
+    kEnd,
+  };
+
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  std::string alternative_db_log_dir_;
+  MockEnv* mem_env_;
+  Env* encrypted_env_;
+  SpecialEnv* env_;
+  std::shared_ptr<Env> env_guard_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  int option_config_;
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16,
+    kSkipNoSeekToLast = 32,
+    kSkipFIFOCompaction = 128,
+    kSkipMmapReads = 256,
+  };
+
+  const int kRangeDelSkipConfigs =
+      // Plain tables do not support range deletions.
+      kSkipPlainTable |
+      // MmapReads disables the iterator pinning that RangeDelAggregator
+      // requires.
+      kSkipMmapReads;
+
+  // `env_do_fsync` decides whether the special Env would do real
+  // fsync for files and directories. Skipping fsync can speed up
+  // tests, but won't cover the exact fsync logic.
+  DBTestBase(const std::string path, bool env_do_fsync);
+
+  ~DBTestBase();
+
+  static std::string Key(int i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "key%06d", i);
+    return std::string(buf);
+  }
+
+  static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+  // Switch to a fresh database with the next option configuration to
+  // test. Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip);
+
+  // Switch between different compaction styles.
+  bool ChangeCompactOptions();
+
+  // Switch between different WAL-related options.
+  bool ChangeWalOptions();
+
+  // Switch between different filter policies.
+  // Jump from kDefault to kFilter to kFullFilter
+  bool ChangeFilterOptions();
+
+  // Switch between different DB options for file ingestion tests.
+  bool ChangeOptionsForFileIngestionTest();
+
+  // Return the current option configuration.
+ Options CurrentOptions(const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + Options CurrentOptions(const Options& default_options, + const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + Options GetDefaultOptions() const; + + Options GetOptions(int option_config) const { + return GetOptions(option_config, GetDefaultOptions()); + } + + Options GetOptions(int option_config, const Options& default_options, + const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + DBImpl* dbfull() { return static_cast_with_check(db_); } + + void CreateColumnFamilies(const std::vector& cfs, + const Options& options); + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options& options); + + void ReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options); + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options& options); + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options); + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options& options); + + void Reopen(const Options& options); + + void Close(); + + void DestroyAndReopen(const Options& options); + + void Destroy(const Options& options, bool delete_cf_paths = false); + + Status ReadOnlyReopen(const Options& options); + + Status TryReopen(const Options& options); + + bool IsDirectIOSupported(); + + bool IsMemoryMappedAccessSupported() const; + + Status Flush(int cf = 0); + + Status Flush(const std::vector& cf_ids); + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Merge(const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Merge(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Delete(const std::string& k); + + Status Delete(int cf, const std::string& k); + + Status SingleDelete(const std::string& k); + + Status SingleDelete(int cf, const std::string& k); + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr); + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr); + + Status Get(const std::string& k, PinnableSlice* v); + + std::vector MultiGet(std::vector cfs, + const std::vector& k, + const Snapshot* snapshot, + const bool batched, + const bool async = false); + + std::vector MultiGet(const std::vector& k, + const Snapshot* snapshot = nullptr, + const bool async = false); + + uint64_t GetNumSnapshots(); + + uint64_t GetTimeOldestSnapshots(); + + uint64_t GetSequenceOldestSnapshots(); + + // Return a string that contains all key,value pairs in order, + // formatted like "(k1->v1)(k2->v2)". + std::string Contents(int cf = 0); + + std::string AllEntriesFor(const Slice& user_key, int cf = 0); + + // Similar to AllEntriesFor but this function also covers reopen with fifo. + // Note that test cases with snapshots or entries in memtable should simply + // use AllEntriesFor instead as snapshots and entries in memtable will + // survive after db reopen. 
+ void CheckAllEntriesWithFifoReopen(const std::string& expected_value, + const Slice& user_key, int cf, + const std::vector& cfs, + const Options& options); + +#ifndef ROCKSDB_LITE + int NumSortedRuns(int cf = 0); + + uint64_t TotalSize(int cf = 0); + + uint64_t SizeAtLevel(int level); + + size_t TotalLiveFiles(int cf = 0); + + size_t CountLiveFiles(); + + int NumTableFilesAtLevel(int level, int cf = 0); + + double CompressionRatioAtLevel(int level, int cf = 0); + + int TotalTableFiles(int cf = 0, int levels = -1); +#endif // ROCKSDB_LITE + + std::vector GetBlobFileNumbers(); + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0); + + size_t CountFiles(); + + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id); + + void Compact(int cf, const Slice& start, const Slice& limit); + + void Compact(const Slice& start, const Slice& limit); + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0); + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest, + int cf); + + void MoveFilesToLevel(int level, int cf = 0); + +#ifndef ROCKSDB_LITE + void DumpFileCounts(const char* label); +#endif // ROCKSDB_LITE + + std::string DumpSSTableList(); + + static void GetSstFiles(Env* env, std::string path, + std::vector* files); + + int GetSstFileCount(std::string path); + + // this will generate non-overlapping files since it keeps increasing key_idx + void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false); + + void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false); + + static const int kNumKeysByGenerateNewRandomFile; + static const int KNumKeysByGenerateNewFile = 100; + + void GenerateNewRandomFile(Random* rnd, bool nowait = false); + + std::string IterStatus(Iterator* iter); + + Options OptionsForLogIterTest(); + + std::string DummyString(size_t len, char c = 'a'); + + void VerifyIterLast(std::string expected_key, int cf = 0); + + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. 
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceLargerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue); + + // Utility method to test InplaceUpdate + void validateNumberOfEntries(int numValues, int cf = 0); + + void CopyFile(const std::string& source, const std::string& destination, + uint64_t size = 0); + + Status GetAllDataFiles(const FileType file_type, + std::unordered_map* sst_files, + uint64_t* total_size = nullptr); + + std::vector ListTableFiles(Env* env, const std::string& path); + + void VerifyDBFromMap( + std::map true_data, + size_t* total_reads_res = nullptr, bool tailing_iter = false, + std::map status = std::map()); + + void VerifyDBInternal( + std::vector> true_data); + +#ifndef ROCKSDB_LITE + uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name); + + uint64_t GetSstSizeHelper(Temperature temperature); +#endif // ROCKSDB_LITE + + uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); + } + + uint64_t TestGetAndResetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); + } + + // Note: reverting this setting within the same test run is not yet + // supported + void SetTimeElapseOnlySleepOnReopen(DBOptions* options); + + private: // Prone to error on direct use + void MaybeInstallTimeElapseOnlySleep(const DBOptions& options); + + bool time_elapse_only_sleep_on_reopen_ = false; +}; + +// For verifying that all files generated by current version have SST +// unique ids. +void VerifySstUniqueIds(const TablePropertiesCollection& props); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc new file mode 100644 index 000000000..f53c36f22 --- /dev/null +++ b/src/rocksdb/db/db_universal_compaction_test.cc @@ -0,0 +1,2235 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
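+
+// The TEST_P cases in this file are parameterized over
+// (num_levels, exclusive_manual_compaction); the INSTANTIATE_TEST_CASE_P
+// calls appear further down. As a sketch of the pattern (the exact value
+// lists here are illustrative, not necessarily the ones used below):
+//
+//   INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels,
+//                           DBTestUniversalCompaction,
+//                           ::testing::Combine(::testing::Values(1, 3, 5),
+//                                              ::testing::Bool()));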
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+class DBTestUniversalCompactionBase
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  explicit DBTestUniversalCompactionBase(const std::string& path)
+      : DBTestBase(path, /*env_do_fsync=*/false) {}
+  void SetUp() override {
+    num_levels_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+  int num_levels_;
+  bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompaction()
+      : DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+  DBTestUniversalCompaction2()
+      : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {}
+};
+
+namespace {
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return false;
+  }
+
+  const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+}  // anonymous namespace
+
+// Make sure we don't trigger a problem if the trigger condition is given
+// to be 0, which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+  Options options = CurrentOptions();
+
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  // Config universal compaction to always compact to one single sorted run.
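+  // (level0_file_num_compaction_trigger = 0 is invalid and gets sanitized to
+  // 1 on open -- the ASSERT_EQ(1, ...) below checks exactly that -- so every
+  // flush makes the column family eligible for compaction, and
+  // max_size_amplification_percent = 0 treats any data outside the last
+  // sorted run as amplification. Together these settings keep compacting
+  // everything down to a single sorted run.)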
+ options.level0_file_num_compaction_trigger = 0; + options.compaction_options_universal.size_ratio = 10; + options.compaction_options_universal.min_merge_width = 2; + options.compaction_options_universal.max_size_amplification_percent = 0; + + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + KeepFilterFactory* filter = new KeepFilterFactory(true); + filter->expect_manual_compaction_.store(false); + options.compaction_filter_factory.reset(filter); + + DestroyAndReopen(options); + ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger); + + Random rnd(301); + int key_idx = 0; + + filter->expect_full_compaction_.store(true); + + for (int num = 0; num < 16; num++) { + // Write 100KB file. And immediately it should be compacted to one file. + GenerateNewFile(&rnd, &key_idx); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumSortedRuns(0), 1); + } + ASSERT_OK(Put(Key(key_idx), "")); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumSortedRuns(0), 1); +} + +TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.num_levels = num_levels_; + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(3)); + + DestroyAndReopen(options); + + // block compaction from happening + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + ASSERT_OK(Put(Key(num * 10), "val")); + if (num) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(Put(Key(30 + num * 10), "val")); + ASSERT_OK(Put(Key(60 + num * 10), "val")); + } + ASSERT_OK(Put("", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // Query set of non existing keys + for (int i = 5; i < 90; i += 10) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + + // Make sure bloom filter is used at least once. + ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Make sure bloom filter is used for all but the last L0 file when looking + // up a non-existent key that's in the range of all L0 files. + ASSERT_EQ(Get(Key(35)), "NOT_FOUND"); + ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1, + TestGetTickerCount(options, BLOOM_FILTER_USEFUL)); + prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Unblock compaction and wait it for happening. 
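+  // (The idiom used above to block compactions: give Env::LOW a single
+  // thread, then occupy it with a SleepingBackgroundTask so queued
+  // compactions cannot run until WakeUp() is called. A minimal sketch:
+  //
+  //   env_->SetBackgroundThreads(1, Env::LOW);
+  //   test::SleepingBackgroundTask task;
+  //   env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &task,
+  //                  Env::Priority::LOW);
+  //   ... run the part of the test that must see no compactions ...
+  //   task.WakeUp();
+  //   task.WaitUntilDone();
+  // )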
+  sleeping_task_low.WakeUp();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // The same queries will not trigger the bloom filter.
+  for (int i = 5; i < 90; i += 10) {
+    ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+  }
+  ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+// 1. A lot of magic numbers ("11" or "12").
+// 2. Assumptions about the memtable flush conditions, which may change
+//    from time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        if (num_levels_ > 3) {
+          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  // Generate a set of files at level 0, but don't trigger level-0
+  // compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB
+    GenerateNewFile(1, &rnd, &key_idx);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  // Stage 2:
+  // Now we have one file at level 0, with size 4. We also have some data in
+  // mem table. Let's continue generating new files at level 0, but don't
+  // trigger level-0 compaction.
+  // First, clean up memtable before inserting new data. This will generate
+  // a level-0 file, with size around 0.4 (according to previously written
+  // data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Stage 3:
+  // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  // generating new files at level 0.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; + num++) { + GenerateNewFile(1, &rnd, &key_idx); + ASSERT_EQ(NumSortedRuns(1), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + GenerateNewFile(1, &rnd, &key_idx); + // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 2.4, 2. + ASSERT_EQ(NumSortedRuns(1), 3); + + // Stage 4: + // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a + // new file of size 1. + GenerateNewFile(1, &rnd, &key_idx); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumSortedRuns(1), 4); + + // Stage 5: + // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate + // a new file of size 1. + filter->expect_full_compaction_.store(true); + GenerateNewFile(1, &rnd, &key_idx); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // All files at level 0 will be compacted into a single one. + ASSERT_EQ(NumSortedRuns(1), 1); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_EQ(NumSortedRuns(1), num + 1); + } + ASSERT_EQ(NumSortedRuns(1), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but will instead trigger size amplification. 
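+  // (Universal compaction estimates size amplification as
+  //    (sum of sizes of all sorted runs except the oldest) / (oldest run),
+  // and picks a full compaction when that exceeds
+  // max_size_amplification_percent. Here, with two ~110KB runs plus the
+  // leftover flush F below, the estimate is roughly (110KB + F) / 110KB;
+  // it crosses the 110% threshold only because F is more than ~11KB of
+  // residual memtable data. The sizes are approximate, not exact.)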
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.size_ratio = 100;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_amp_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() ==
+              CompactionReason::kUniversalSizeAmplification) {
+            total_size_amp_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but could instead trigger size amplification if it's set
+  // to 110.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), 3);
+
+  // Trigger compaction if size amplification exceeds 110% without reopening DB
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            200U);
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {{"compaction_options_universal",
+                                   "{max_size_amplification_percent=110;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            110u);
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+                      .max_size_amplification_percent);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify that size amplification did happen
+  ASSERT_EQ(NumSortedRuns(1), 1);
+  ASSERT_EQ(total_picked_compactions, 1);
+  ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.max_size_amplification_percent = 2000;
+  options.compaction_options_universal.size_ratio = 0;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_ratio_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+            total_size_ratio_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate three files in Level 0. All files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+  // Flush whatever is remaining in memtable. This is typically small, about
+  // 30KB.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+  ASSERT_EQ(total_picked_compactions, 0);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1],
+      {{"compaction_options_universal",
+        "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_merge_width,
+            2u);
+  ASSERT_EQ(
+      dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+      100u);
+
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+            2u);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+  // On compaction: the files are below the size amp threshold, so we fall
+  // through to checking read amp conditions. The configured size ratio is
+  // not big enough to take 0.3 into consideration. So the next files 1 and 1
+  // are compacted together first, as they satisfy the size ratio condition
+  // and the (min_merge_width, max_merge_width) condition, giving an output
+  // file of size 2. Next, the newly generated 2 and the last file 1 are
+  // compacted together. So at the end: #sortedRuns = 2,
+  // #picked_compactions = 2, and all the picked ones are size ratio based
+  // compactions.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+  // If max_merge_width had not been changed dynamically above, and if it
+  // continued to be the default value of UINT_MAX, total_picked_compactions
+  // would have been 1.
+  ASSERT_EQ(total_picked_compactions, 2);
+  ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 10;
+
+  ChangeCompactOptions();
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 1;
+  options.target_file_size_base = options.write_buffer_size;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+  Random rnd(301);
+  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  std::vector<std::string> compaction_input_file_names;
+  for (auto file : cf_meta.levels[0].files) {
+    if (rnd.OneIn(2)) {
+      compaction_input_file_names.push_back(file.name);
+    }
+  }
+
+  if (compaction_input_file_names.size() == 0) {
+    compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+  }
+
+  // expect failure, since universal compaction only allows L0 output
+  ASSERT_FALSE(dbfull()
+                   ->CompactFiles(CompactionOptions(), handles_[1],
+                                  compaction_input_file_names, 1)
+                   .ok());
+
+  // expect ok and verify the compacted files no longer exist.
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+                                   compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  VerifyCompactionResult(
+      cf_meta, std::set<std::string>(compaction_input_file_names.begin(),
+                                     compaction_input_file_names.end()));
+
+  compaction_input_file_names.clear();
+
+  // Pick the first and the last file, expect everything is
+  // compacted into one single file.
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name); + compaction_input_file_names.push_back( + cf_meta.levels[0].files[cf_meta.levels[0].files.size() - 1].name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); + + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + ASSERT_EQ(cf_meta.levels[0].files.size(), 1U); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100 << 10; // 100KB + options.num_levels = 7; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Generate 3 overlapping files + Random rnd(301); + for (int i = 0; i < 210; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + for (int i = 200; i < 300; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + for (int i = 250; i < 260; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("3", FilesPerLevel(0)); + // Compact all files into 1 file and put it in L4 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 4; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class DBTestUniversalCompactionMultiLevels + : public DBTestUniversalCompactionBase { + public: + DBTestUniversalCompactionMultiLevels() + : DBTestUniversalCompactionBase( + "/db_universal_compaction_multi_levels_test") {} +}; + +TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 8; + options.max_background_compactions = 3; + options.target_file_size_base = 32 * 1024; + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 100000; + for (int i = 0; i < num_keys * 2; i++) { + ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + for (int i = num_keys; i < num_keys * 2; i++) { + ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i)); + } +} + +// Tests universal compaction with trivial move enabled +TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { + non_trivial_move++; + ASSERT_TRUE(arg != nullptr); + int output_level = *(static_cast(arg)); + ASSERT_EQ(output_level, 0); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + 
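+  // (allow_trivial_move, set just below, lets universal compaction move a
+  // whole input file to the output level without rewriting it when nothing
+  // overlaps; the two sync-point callbacks above count how many compactions
+  // took the trivial-move path versus actually merging data, and the test
+  // asserts both counters end up non-zero.)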
options.compaction_options_universal.allow_trivial_move = true; + options.num_levels = 3; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 2; + options.target_file_size_base = 32 * 1024; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 150000; + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + std::vector values; + + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_GT(trivial_move, 0); + ASSERT_GT(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels, + ::testing::Combine(::testing::Values(3, 20), + ::testing::Bool())); + +class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase { + public: + DBTestUniversalCompactionParallel() + : DBTestUniversalCompactionBase("/db_universal_compaction_prallel_test") { + } +}; + +TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.env = env_; + options.write_buffer_size = 1 << 10; // 1KB + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + options.max_background_flushes = 3; + options.target_file_size_base = 1 * 1024; + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Delay every compaction so multiple compactions will happen. + std::atomic num_compactions_running(0); + std::atomic has_parallel(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():Start", [&](void* /*arg*/) { + if (num_compactions_running.fetch_add(1) > 0) { + has_parallel.store(true); + return; + } + for (int nwait = 0; nwait < 20000; nwait++) { + if (has_parallel.load() || num_compactions_running.load() > 1) { + has_parallel.store(true); + break; + } + env_->SleepForMicroseconds(1000); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():End", + [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 30000; + for (int i = 0; i < num_keys * 2; i++) { + ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(num_compactions_running.load(), 0); + ASSERT_TRUE(has_parallel.load()); + + for (int i = num_keys; i < num_keys * 2; i++) { + ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i)); + } + + // Reopen and check. 
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 1 * 1024;  // 1KB
+  options.level0_file_num_compaction_trigger = 7;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 1024 * 1024;  // 1MB
+
+  // Disable size amplification compaction
+  options.compaction_options_universal.max_size_amplification_percent =
+      UINT_MAX;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+        "BackgroundCallCompaction:0"},
+       {"UniversalCompactionBuilder::PickCompaction:Return",
+        "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+       {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+        "CompactionJob::Run():Start"}});
+
+  int total_picked_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write 7 files to trigger compaction
+  int key_idx = 1;
+  for (int i = 1; i <= 70; i++) {
+    std::string k = Key(key_idx++);
+    ASSERT_OK(Put(k, k));
+    if (i % 10 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Wait for the 1st background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Write 3 files while the 1st compaction is held
+  // These 3 files have different sizes to avoid compacting based on size_ratio
+  int num_keys = 1000;
+  for (int i = 0; i < 3; i++) {
+    for (int j = 1; j <= num_keys; j++) {
+      std::string k = Key(key_idx++);
+      ASSERT_OK(Put(k, k));
+    }
+    ASSERT_OK(Flush());
+    num_keys -= 100;
+  }
+
+  // Hold the 1st compaction from finishing
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // There should only be one picked compaction as the score drops below one
+  // after the first one is picked.
+ EXPECT_EQ(total_picked_compactions, 1); + EXPECT_EQ(TotalTableFiles(), 4); + + // Stop SyncPoint and destroy the DB and reopen it again + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + key_idx = 1; + total_picked_compactions = 0; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Write 7 files to trigger compaction + for (int i = 1; i <= 70; i++) { + std::string k = Key(key_idx++); + ASSERT_OK(Put(k, k)); + if (i % 10 == 0) { + ASSERT_OK(Flush()); + } + } + + // Wait for the 1st background compaction process to start + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0"); + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + + // Write 8 files while 1st compaction is held + // These 8 files have different sizes to avoid compacting based on size_ratio + num_keys = 1000; + for (int i = 0; i < 8; i++) { + for (int j = 1; j <= num_keys; j++) { + std::string k = Key(key_idx++); + ASSERT_OK(Put(k, k)); + } + ASSERT_OK(Flush()); + num_keys -= 100; + } + + // Wait for the 2nd background compaction process to start + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0"); + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1"); + + // Hold the 1st and 2nd compaction from finishing + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // This time we will trigger a compaction because of size ratio and + // another compaction because of number of files that are not compacted + // greater than 7 + EXPECT_GE(total_picked_compactions, 2); +} + +INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel, + ::testing::Combine(::testing::Values(1, 10), + ::testing::Values(false))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; // 4KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 4; + options.num_levels = num_levels_; + options.compaction_options_universal.compression_size_percent = -1; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + int key_idx = 0; + + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + // Write 100KB (100 values, each 1K) + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + + if (num < options.level0_file_num_compaction_trigger - 1) { + ASSERT_EQ(NumSortedRuns(1), num + 1); + } + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumSortedRuns(1), 1); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; // 4KB + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + 
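+  // (kCompactionStopStyleSimilarSize, set just below, makes universal
+  // compaction stop extending a candidate set once the next file is no
+  // longer of similar size to the files already picked, instead of the
+  // default total-size stop style; the staged run counts asserted in this
+  // test depend on dissimilar runs being left uncompacted.)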
options.compaction_options_universal.size_ratio = 10; + options.compaction_options_universal.stop_style = + kCompactionStopStyleSimilarSize; + options.num_levels = num_levels_; + DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // Stage 1: + // Generate a set of files at level 0, but don't trigger level-0 + // compaction. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + // Write 100KB (100 values, each 1K) + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumSortedRuns(), num + 1); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Suppose each file flushed from mem table has size 1. Now we compact + // level0_file_num_compaction_trigger=4 files and should have a big + // file of size 4. + ASSERT_EQ(NumSortedRuns(), 1); + + // Stage 2: + // Now we have one file at level 0, with size 4. We also have some data in + // mem table. Let's continue generating new files at level 0, but don't + // trigger level-0 compaction. + // First, clean up memtable before inserting new data. This will generate + // a level-0 file, with size around 0.4 (according to previously written + // data amount). + ASSERT_OK(dbfull()->Flush(FlushOptions())); + for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; + num++) { + // Write 100KB (100 values, each 1K) + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumSortedRuns(), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 0.4, 2. + ASSERT_EQ(NumSortedRuns(), 3); + // Stage 3: + // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one + // more file at level-0, which should trigger level-0 compaction. + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumSortedRuns(), 4); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) { + if (!Snappy_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = num_levels_; + options.compaction_options_universal.compression_size_percent = 70; + DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // The first compaction (2) is compressed. 
+ for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_LT(TotalSize(), 110000U * 2 * 0.9); + + // The second compaction (4) is compressed. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_LT(TotalSize(), 110000 * 4 * 0.9); + + // The third compaction (2 4) is compressed since this time the inputs are + // (1 1 3.2) and 3.2/5.2 doesn't reach the ratio. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_LT(TotalSize(), 110000 * 6 * 0.9); + + // When the compaction grows up to (2 4 8), the latest output is not + // compressed. + for (int num = 0; num < 8; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) { + if (!Snappy_Supported()) { + return; + } + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = num_levels_; + options.compaction_options_universal.compression_size_percent = 95; + DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // When the compaction grows up to (2 4 8), the latest output is still + // compressed given the high compression_size_percent (95). 
+ for (int num = 0; num < 14; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +// Test that checks trivial move in universal compaction +TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { + non_trivial_move++; + ASSERT_TRUE(arg != nullptr); + int output_level = *(static_cast<int*>(arg)); + ASSERT_EQ(output_level, 0); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.allow_trivial_move = true; + options.num_levels = 2; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 1; + options.target_file_size_base = 32 * 1024; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 250000; + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + std::vector<std::string> values; + + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_GT(trivial_move, 0); + ASSERT_GT(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +// Test that checks trivial move in universal compaction +TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { + int32_t trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { + ASSERT_TRUE(arg != nullptr); + int output_level = *(static_cast<int*>(arg)); + ASSERT_EQ(output_level, 0); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.allow_trivial_move = true; + options.num_levels = 15; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 8; + options.max_background_compactions = 2; + options.target_file_size_base = 64 * 1024; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 500000; + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(Put(1, Key(i), 
Key(i))); + } + std::vector<std::string> values; + + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_GT(trivial_move, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.write_buffer_size = 111 << 10; // 114KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + + std::vector<std::string> filenames; + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); + } + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to second path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1,1,4) -> (2, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 2, 4) -> (3, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 4) -> (8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + + // (1, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 1, 8) -> (2, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + // (1, 2, 8) -> (3, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 8) -> (4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + + // (1, 4, 8) -> (5, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + 
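// Reopen with the same multi-path options and re-read every key to check + // that the placement of files across db_paths survives recovery. + 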
Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 10; + options.write_buffer_size = 111 << 10; // 114KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + + std::vector<Options> option_vector; + option_vector.emplace_back(options); + ColumnFamilyOptions cf_opt1(options), cf_opt2(options); + // Configure CF1 specific paths. + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt1); + CreateColumnFamilies({"one"}, option_vector[1]); + + // Configure CF2 specific paths. + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt2); + CreateColumnFamilies({"two"}, option_vector[2]); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + Random rnd(301); + int key_idx = 0; + int key_idx1 = 0; + int key_idx2 = 0; + + auto generate_file = [&]() { + GenerateNewFile(0, &rnd, &key_idx); + GenerateNewFile(1, &rnd, &key_idx1); + GenerateNewFile(2, &rnd, &key_idx2); + }; + + auto check_sstfilecount = [&](int path_id, int expected) { + ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path)); + }; + + auto check_getvalues = [&]() { + for (int i = 0; i < key_idx; i++) { + auto v = Get(0, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx1; i++) { + auto v = Get(1, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx2; i++) { + auto v = Get(2, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + }; + + // First three 110KB files are not going to second path. 
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + generate_file(); + } + + // Another 110KB triggers a compaction to 400K file to second path + generate_file(); + check_sstfilecount(2, 1); + + // (1, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(0, 1); + + // (1,1,4) -> (2, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 2, 4) -> (3, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 3, 4) -> (8) + generate_file(); + check_sstfilecount(3, 1); + + // (1, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(0, 1); + + // (1, 1, 8) -> (2, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(1, 1); + + // (1, 2, 8) -> (3, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 3, 8) -> (4, 8) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(3, 1); + + // (1, 4, 8) -> (5, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(2, 1); + check_sstfilecount(0, 0); + + check_getvalues(); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + check_getvalues(); + + Destroy(options, true); +} + +TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { + std::function<void(int)> verify_func = [&](int num_keys_in_db) { + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= num_keys_in_db; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); + }; + + Random rnd(301); + int max_key1 = 200; + int max_key2 = 600; + int max_key3 = 800; + const int KNumKeysPerFile = 10; + + // Stage 1: open a DB with universal compaction, num_levels=1 + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options.write_buffer_size = 200 << 10; // 200KB + options.level0_file_num_compaction_trigger = 3; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysPerFile)); + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + for (int i = 0; i <= max_key1; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Stage 2: reopen with universal compaction, num_levels=4 + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 4; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + verify_func(max_key1); + + // Insert more keys + for (int i = max_key1 + 1; i <= max_key2; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + verify_func(max_key2); + // Compaction to non-L0 has happened. 
+ ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0); + + // Stage 3: Compact all data back to level 0, then revert to num_levels=1. + options.num_levels = 4; + options.target_file_size_base = INT_MAX; + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Compact all to level 0 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 0; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + // Need to restart it once to remove higher level records in manifest. + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Final reopen + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Insert more keys + for (int i = max_key2 + 1; i <= max_key3; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + verify_func(max_key3); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { + if (!Snappy_Supported()) { + return; + } + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.write_buffer_size = 111 << 10; // 114KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + + std::vector<std::string> filenames; + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); + } + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. 
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to second path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1,1,4) -> (2, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 2, 4) -> (3, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 4) -> (8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 1, 8) -> (2, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 2, 8) -> (3, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 8) -> (4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 4, 8) -> (5, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) { + if (num_levels_ == 1) { + // for single-level universal, everything's bottom level so nothing should + // be executed in bottom-pri thread pool. + return; + } + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.max_background_compactions = 2; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + // Need to get a token to enable compaction parallelism up to + // `max_background_compactions` jobs. + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// wait for the full compaction to be picked before adding files intended + // for the second one. + {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, + // the full (bottom-pri) compaction waits until a partial (low-pri) + // compaction has started to verify they can run in parallel. 
+ {"DBImpl::BackgroundCompaction:NonTrivial", + "DBImpl::BGWorkBottomCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + if (i == 0) { + TEST_SYNC_POINT( + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); + } + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // First compaction should output to bottom level. Second should output to L0 + // since older L0 files pending compaction prevent it from being placed lower. + ASSERT_EQ(NumSortedRuns(), 2); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) { + // Regression test for extra compactions scheduled. Once enough compactions + // have been scheduled to bring the score below one, we should stop + // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily. + const int kNumFilesTrigger = 8; + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2; + options.compaction_options_universal.max_size_amplification_percent = + static_cast(-1); + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + options.num_levels = num_levels_; + Reopen(options); + + std::atomic num_compactions_attempted(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void* /*arg*/) { ++num_compactions_attempted; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Compacting the first four files was enough to bring the score below one so + // there's no need to schedule any more compactions. + ASSERT_EQ(1, num_compactions_attempted); + ASSERT_EQ(NumSortedRuns(), 5); +} + +TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { + // Regression test for conflict between: + // (1) Running CompactFiles including file in the final sorted run; and + // (2) Picking universal size-amp-triggered compaction, which always includes + // the final sorted run. 
+ if (exclusive_manual_compaction_) { + return; + } + + Options opts = CurrentOptions(); + opts.compaction_style = kCompactionStyleUniversal; + opts.compaction_options_universal.max_size_amplification_percent = 50; + opts.compaction_options_universal.min_merge_width = 2; + opts.compression = kNoCompression; + opts.level0_file_num_compaction_trigger = 2; + opts.max_background_compactions = 2; + opts.num_levels = num_levels_; + Reopen(opts); + + // make sure compaction jobs can be parallelized + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); + ColumnFamilyMetaData cf_meta; + ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily(); + dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta); + ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size()); + std::string first_sst_filename = + cf_meta.levels[num_levels_ - 1].files[0].name; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CompactFilesImpl:0", + "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"}, + {"DBImpl::BackgroundCompaction():AfterPickCompaction", + "CompactFilesImpl:1"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_files_thread([&]() { + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh, + {first_sst_filename}, num_levels_ - 1)); + }); + + TEST_SYNC_POINT( + "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + compact_files_thread.join(); +} + +INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction, + ::testing::Combine(::testing::Values(1, 3, 5), + ::testing::Bool())); + +class DBTestUniversalManualCompactionOutputPathId + : public DBTestUniversalCompactionBase { + public: + DBTestUniversalManualCompactionOutputPathId() + : DBTestUniversalCompactionBase( + "/db_universal_compaction_manual_pid_test") {} +}; + +TEST_P(DBTestUniversalManualCompactionOutputPathId, + ManualCompactionOutputPathId) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.db_paths.emplace_back(dbname_, 1000000000); + options.db_paths.emplace_back(dbname_ + "_2", 1000000000); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.target_file_size_base = 1 << 30; // Big size + options.level0_file_num_compaction_trigger = 10; + Destroy(options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + MakeTables(3, "p", "q", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, TotalLiveFiles(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); + + // Full compaction to DB path 1 + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + ASSERT_EQ(1, TotalLiveFiles(1)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(1, TotalLiveFiles(1)); + ASSERT_EQ(0, 
GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + MakeTables(1, "p", "q", 1); + ASSERT_EQ(2, TotalLiveFiles(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, TotalLiveFiles(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + // Full compaction to DB path 0 + compact_options.target_path_id = 0; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + ASSERT_EQ(1, TotalLiveFiles(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); + + // Fail when compacting to an invalid path ID + compact_options.target_path_id = 2; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsInvalidArgument()); +} + +INSTANTIATE_TEST_CASE_P(OutputPathId, + DBTestUniversalManualCompactionOutputPathId, + ::testing::Combine(::testing::Values(1, 8), + ::testing::Bool())); + +TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + // MoveFilesToLevel(6); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +#if defined(ENABLE_SINGLE_LEVEL_DTC) +TEST_F(DBTestUniversalCompaction2, SingleLevel) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.num_levels = 1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + 
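+ // Delete a dense window of keys: once kNumDelsTrigger deletions land + // inside a kWindowSize-key window, the CompactOnDeletionCollector marks + // the flushed file as eligible for compaction.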
+ for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +} +#endif // ENABLE_SINGLE_LEVEL_DTC + +TEST_F(DBTestUniversalCompaction2, MultipleLevels) { + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 4; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 500; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 500; i < 1000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 1000; i < 1500; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 1500; i < 2000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); + + for (i = 1999; i < 2333; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 2333; i < 2666; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 2666; i < 2999; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); + ASSERT_GT(NumTableFilesAtLevel(5), 0); + + for (i = 1900; i < 2100; ++i) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(0, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(3)); + ASSERT_EQ(0, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +TEST_F(DBTestUniversalCompaction2, OverlappingL0) { + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 5; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 2000; i < 3000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 3500; i < 4000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + for (i = 2900; i < 3100; ++i) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); 
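+ // Presumably only the L0 files overlapping the tombstone range get picked + // up by the deletion-triggered compaction, so the two non-overlapping L0 + // files are expected to remain.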
+ + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +TEST_F(DBTestUniversalCompaction2, IngestBehind) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.allow_ingest_behind = true; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ASSERT_OK(Flush()); + // MoveFilesToLevel(6); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(6)); + ASSERT_GT(NumTableFilesAtLevel(5), 0); +} + +TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + options.env = env_; + KeepFilterFactory* filter = new KeepFilterFactory(true); + options.compaction_filter_factory.reset(filter); + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + KeepFilter df; + options.compaction_filter_factory.reset(); + options.compaction_filter = &df; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 60 * 24 * 60 * 60; + options.compaction_filter = nullptr; + Reopen(options); + ASSERT_EQ(60 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { + Options opts = CurrentOptions(); + opts.env = env_; + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 10; + opts.max_open_files = -1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + opts.num_levels = 5; + env_->SetMockSleep(); + Reopen(opts); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + int periodic_compactions = 0; + int start_level = -1; + int output_level = -1; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", + [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(arg != nullptr); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kPeriodicCompaction); + start_level = compaction->start_level(); + output_level = compaction->output_level(); + periodic_compactions++; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Case 1: Oldest flushed file exceeds periodic 
compaction threshold. + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ(0, periodic_compactions); + // Move clock forward so that the flushed file qualifies for periodic + // compaction. + env_->MockSleepForSeconds(48 * 60 * 60 + 100); + + // Another flush will trigger a compaction of the oldest file. + ASSERT_OK(Put("foo", "bar2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(1, periodic_compactions); + ASSERT_EQ(0, start_level); + ASSERT_EQ(4, output_level); + + // Case 2: Oldest compacted file exceeds periodic compaction threshold + periodic_compactions = 0; + // A flush doesn't trigger a periodic compaction when the threshold is not + // hit + ASSERT_OK(Put("foo", "bar2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, periodic_compactions); + + // After the periodic compaction threshold is hit, a flush will trigger + // a compaction + ASSERT_OK(Put("foo", "bar2")); + env_->MockSleepForSeconds(48 * 60 * 60 + 100); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, periodic_compactions); + ASSERT_EQ(0, start_level); + ASSERT_EQ(4, output_level); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void)argc; + (void)argv; + return 0; +#endif +} diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc new file mode 100644 index 000000000..5b5ec76af --- /dev/null +++ b/src/rocksdb/db/db_wal_test.cc @@ -0,0 +1,2314 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/file_system.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { +class DBWALTestBase : public DBTestBase { + protected: + explicit DBWALTestBase(const std::string& dir_name) + : DBTestBase(dir_name, /*env_do_fsync=*/true) {} + +#if defined(ROCKSDB_PLATFORM_POSIX) + public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of the running file system. + // Skip this test if fallocate is not supported. 
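+ // The probe below creates a scratch file, attempts a one-byte fallocate(), + // and treats ENOSYS/EOPNOTSUPP from the filesystem as lack of support.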
+ std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + + uint64_t GetAllocatedFileSize(std::string file_name) { + struct stat sbuf; + int err = stat(file_name.c_str(), &sbuf); + assert(err == 0); + return sbuf.st_blocks * 512; + } +#endif // ROCKSDB_PLATFORM_POSIX +}; + +class DBWALTest : public DBWALTestBase { + public: + DBWALTest() : DBWALTestBase("/db_wal_test") {} +}; + +// A SpecialEnv enriched to give more insight about deleted files +class EnrichedSpecialEnv : public SpecialEnv { + public: + explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {} + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& soptions) override { + InstrumentedMutexLock l(&env_mutex_); + if (f == skipped_wal) { + deleted_wal_reopened = true; + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { + gap_in_wals = true; + } + } + return SpecialEnv::NewSequentialFile(f, r, soptions); + } + Status DeleteFile(const std::string& fname) override { + if (IsWAL(fname)) { + deleted_wal_cnt++; + InstrumentedMutexLock l(&env_mutex_); + // If this is the first WAL, remember its name and skip deleting it. We + // remember its name partly because the application might attempt to + // delete the file again. + if (skipped_wal.size() != 0 && skipped_wal != fname) { + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; + } + } else { + skipped_wal = fname; + return Status::OK(); + } + } + return SpecialEnv::DeleteFile(fname); + } + bool IsWAL(const std::string& fname) { + // printf("iswal %s\n", fname.c_str()); + return fname.compare(fname.size() - 3, 3, "log") == 0; + } + + InstrumentedMutex env_mutex_; + // the wal whose actual delete was skipped by the env + std::string skipped_wal = ""; + // the largest WAL that was requested to be deleted + std::string largest_deleted_wal = ""; + // number of WALs that were successfully deleted + std::atomic<int> deleted_wal_cnt = {0}; + // the WAL whose delete from fs was skipped is reopened during recovery + std::atomic<bool> deleted_wal_reopened = {false}; + // whether a gap in the WALs was detected during recovery + std::atomic<bool> gap_in_wals = {false}; +}; + +class DBWALTestWithEnrichedEnv : public DBTestBase { + public: + DBWALTestWithEnrichedEnv() + : DBTestBase("db_wal_test", /*env_do_fsync=*/true) { + enriched_env_ = new EnrichedSpecialEnv(env_->target()); + auto options = CurrentOptions(); + options.env = enriched_env_; + options.allow_2pc = true; + Reopen(options); + delete env_; + // to be deleted by the parent class + env_ = enriched_env_; + } + + protected: + EnrichedSpecialEnv* enriched_env_; +}; + +// Test that the recovery would successfully avoid the gaps between the logs. +// One known scenario that could cause this is that the application issues +// WAL deletions out of order. 
For the sake of simplicity in the test, here we +// create the gap by manipulating the env to skip deletion of the first WAL but +// not the ones after it. +TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) { + auto options = last_options_; + // To cause frequent WAL deletion + options.write_buffer_size = 128; + Reopen(options); + + WriteOptions writeOpt = WriteOptions(); + for (int i = 0; i < 128 * 5; i++) { + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + } + FlushOptions fo; + fo.wait = true; + ASSERT_OK(db_->Flush(fo)); + + // some wals are deleted + ASSERT_NE(0, enriched_env_->deleted_wal_cnt); + // but not the first one + ASSERT_NE(0, enriched_env_->skipped_wal.size()); + + // Test that the WAL that was not deleted will be skipped during recovery + options = last_options_; + Reopen(options); + ASSERT_FALSE(enriched_env_->deleted_wal_reopened); + ASSERT_FALSE(enriched_env_->gap_in_wals); +} + +TEST_F(DBWALTest, WAL) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // Both values should be present. + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // Again, both values should be present. 
+ ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RollLog) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + } + ASSERT_OK(Put(1, "foo", "v4")); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + } + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, SyncWALNotBlockWrite) { + Options options = CurrentOptions(); + options.max_write_buffer_number = 4; + DestroyAndReopen(options); + + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo5", "bar5")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritableFileWriter::SyncWithoutFlush:1", + "DBWALTest::SyncWALNotBlockWrite:1"}, + {"DBWALTest::SyncWALNotBlockWrite:2", + "WritableFileWriter::SyncWithoutFlush:2"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); }); + + TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1"); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo3", "bar3")); + FlushOptions fo; + fo.wait = false; + ASSERT_OK(db_->Flush(fo)); + ASSERT_OK(Put("foo4", "bar4")); + + TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2"); + + thread.join(); + + ASSERT_EQ(Get("foo1"), "bar1"); + ASSERT_EQ(Get("foo2"), "bar2"); + ASSERT_EQ(Get("foo3"), "bar3"); + ASSERT_EQ(Get("foo4"), "bar4"); + ASSERT_EQ(Get("foo5"), "bar5"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, SyncWALNotWaitWrite) { + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo3", "bar3")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"}, + {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread( + [&]() { ASSERT_OK(Put("foo2", "bar2")); }); + // Moving this to SyncWAL before the actual fsync + // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); + ASSERT_OK(db_->SyncWAL()); + // Moving this to SyncWAL after actual fsync + // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); + + thread.join(); + + ASSERT_EQ(Get("foo1"), "bar1"); + ASSERT_EQ(Get("foo2"), "bar2"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, Recover) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RecoverWithTableHandle) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + 
options.avoid_flush_during_recovery = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + + options = CurrentOptions(); + const int kSmallMaxOpenFiles = 13; + if (option_config_ == kDBLogDir) { + // Use this option to check not preloading files + // Set the max open files to be small enough so no preload will + // happen. + options.max_open_files = kSmallMaxOpenFiles; + // RocksDB sanitizes max open files to at least 20. Modify it back. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast<int*>(arg); + *max_open_files = kSmallMaxOpenFiles; + }); + + } else if (option_config_ == kWalDirAndMmapReads) { + // Use this option to check always loading all files. + options.max_open_files = 100; + } else { + options.max_open_files = -1; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::vector<std::vector<FileMetaData>> files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); + size_t total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (options.max_open_files == kSmallMaxOpenFiles) { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } + } + } + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RecoverWithBlob) { + // Write a value that's below the prospective size limit for blobs and another + // one that's above. Note that blob files are not actually enabled at this + // point. + constexpr uint64_t min_blob_size = 10; + + constexpr char short_value[] = "short"; + static_assert(sizeof(short_value) - 1 < min_blob_size, + "short_value too long"); + + constexpr char long_value[] = "long_value"; + static_assert(sizeof(long_value) - 1 >= min_blob_size, + "long_value too short"); + + ASSERT_OK(Put("key1", short_value)); + ASSERT_OK(Put("key2", long_value)); + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled. A new table file/blob file + // pair should be written during recovery. 
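+ // Values shorter than min_blob_size stay inline in the table file, while + // longer ones should be written to a blob file: key1 is expected to stay + // inline and key2 to become a blob reference.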
+ Options options; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + ASSERT_EQ(Get("key1"), short_value); + ASSERT_EQ(Get("key2"), long_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_EQ(l0_files.size(), 1); + + const FileMetaData* const table_file = l0_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.front(); + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), "key1"); + ASSERT_EQ(table_file->largest.user_key(), "key2"); + ASSERT_EQ(table_file->fd.smallest_seqno, 1); + ASSERT_EQ(table_file->fd.largest_seqno, 2); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_FALSE(compaction_stats.empty()); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); + + const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +TEST_F(DBWALTest, RecoverWithBlobMultiSST) { + // Write several large (4 KB) values without flushing. Note that blob files + // are not actually enabled at this point. + std::string large_value(1 << 12, 'a'); + + constexpr int num_keys = 64; + + for (int i = 0; i < num_keys; ++i) { + ASSERT_OK(Put(Key(i), large_value)); + } + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled and write buffer size set to a + // smaller value. Multiple table files+blob files should be written and added + // to the Version during recovery. 
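+  // (With a 64 KB write buffer and 64 values of 4 KB each, recovery fills the
+  // memtable several times over, so multiple table/blob file pairs are
+  // expected below.)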
+  Options options;
+  options.write_buffer_size = 1 << 16;  // 64 KB
+  options.enable_blob_files = true;
+  options.avoid_flush_during_recovery = false;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  for (int i = 0; i < num_keys; ++i) {
+    ASSERT_EQ(Get(Key(i)), large_value);
+  }
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  ASSERT_NE(versions, nullptr);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_GT(l0_files.size(), 1);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_GT(blob_files.size(), 1);
+
+  ASSERT_EQ(l0_files.size(), blob_files.size());
+}
+
+TEST_F(DBWALTest, WALWithChecksumHandoff) {
+#ifndef ROCKSDB_ASSERT_STATUS_CHECKED
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  do {
+    Options options = CurrentOptions();
+
+    options.checksum_handoff_file_types.Add(FileType::kWalFile);
+    options.env = fault_fs_env.get();
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Both values should be present.
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+
+    writeOpt.disableWAL = true;
+    // For this put, the data is persisted by a memtable flush rather than the
+    // WAL.
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    writeOpt.disableWAL = false;
+    // The data is persisted in the WAL.
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3"));
+    // The checksum type no longer matches, so the write fails.
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+    writeOpt.disableWAL = false;
+    ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Due to the write failure, Get should not find "v3" under "foo".
+    ASSERT_NE("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "zoo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+    // Each write will be simulated as corrupted.
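+    // (Checksum handoff means the writer hands its checksum down to the file
+    // system, which verifies it on write; corrupting the data before the
+    // write therefore fails WAL-enabled puts.)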
+    fault_fs->IngestDataCorruptionBeforeWrite();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4"));
+    writeOpt.disableWAL = false;
+    ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_NE("v4", Get(1, "foo"));
+    ASSERT_NE("v4", Get(1, "bar"));
+    fault_fs->NoDataCorruptionBeforeWrite();
+
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+    // The file system provides neither a checksum method nor verification.
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v5", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "bar"));
+
+    Destroy(options);
+  } while (ChangeWalOptions());
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+}
+
+class DBRecoveryTestBlobError
+    : public DBWALTest,
+      public testing::WithParamInterface<std::string> {
+ public:
+  DBRecoveryTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) {
+  // Write a value. Note that blob files are not actually enabled at this
+  // point.
+  ASSERT_OK(Put("key", "blob"));
+
+  // Reopen with blob files enabled but make blob file writing fail during
+  // recovery.
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.enable_blob_files = true;
+  options.avoid_flush_during_recovery = false;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  ASSERT_NOK(TryReopen(options));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Make sure the files generated by the failed recovery have been deleted.
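+  // (Scan the DB directory: any surviving table or blob file would be
+  // leftover state from the failed recovery, which the loop below rules out.)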
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t number = 0;
+    FileType type = kTableFile;
+
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+
+    ASSERT_NE(type, kTableFile);
+    ASSERT_NE(type, kBlobFile);
+  }
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  do {
+    // Delete old files in the backup_logs directory.
+    ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+    std::vector<std::string> old_files;
+    ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+    for (auto& file : old_files) {
+      ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+    }
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.wal_dir = dbname_ + "/logs";
+    DestroyAndReopen(options);
+
+    // Fill up the DB.
+    std::string one, two;
+    PutFixed64(&one, 1);
+    PutFixed64(&two, 2);
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+    // Copy the logs to the backup directory.
+    std::vector<std::string> logs;
+    ASSERT_OK(env_->GetChildren(options.wal_dir, &logs));
+    for (auto& log : logs) {
+      CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+    }
+
+    // Recover the DB.
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+
+    // Copy the logs from backup back to the WAL dir.
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+    }
+    // This should ignore the log files; recovery should not happen again.
+    // If the recovery happens, the same merge operator would be called twice,
+    // leading to incorrect results.
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+    Destroy(options);
+    Reopen(options);
+    Close();
+
+    // Copy the logs from backup back to the WAL dir.
+    ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+    }
+    // Assert that we successfully recovered only from the logs, even though
+    // we destroyed the DB.
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+
+    // Recovery will fail if the DB directory doesn't exist.
+    Destroy(options);
+    // Copy the logs from backup back to the WAL dir.
+    ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      // We won't be needing this file anymore.
+      ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log));
+    }
+    Status s = TryReopen(options);
+    ASSERT_NOK(s);
+    Destroy(options);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+  } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1000 * 1000;
+  options.max_total_wal_size = 0;
+
+  size_t expected_preallocation_size = static_cast<size_t>(
+      options.write_buffer_size + options.write_buffer_size / 10);
+
+  DestroyAndReopen(options);
+
+  std::atomic<int> called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.max_total_wal_size = 1000 * 1000;
+  expected_preallocation_size =
+      static_cast<size_t>(options.max_total_wal_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.db_write_buffer_size = 800 * 1000;
+  expected_preallocation_size =
+      static_cast<size_t>(options.db_write_buffer_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  expected_preallocation_size = 700 * 1000;
+  std::shared_ptr<WriteBufferManager> write_buffer_manager =
+      std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+  options.write_buffer_manager = write_buffer_manager;
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBTestWalFile.GetPreallocationStatus", [&](void* arg) { + ASSERT_TRUE(arg != nullptr); + size_t preallocation_size = *(static_cast(arg)); + ASSERT_EQ(expected_preallocation_size, preallocation_size); + called.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); + Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(2, called.load()); +} +#endif // !(defined NDEBUG) || !defined(OS_WIN) + +#ifndef ROCKSDB_LITE +TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + + // For github issue #1303 + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.recycle_log_file_num = 2; + if (i != 0) { + options.wal_dir = alternative_wal_dir_; + } + + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + ASSERT_GT(log_files.size(), 0); + ASSERT_OK(Flush()); + + // Now the original WAL is in log_files[0] and should be marked for + // recycling. + // Verify full purge cannot remove this file. + JobContext job_context(0); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /* force */); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + + if (i == 0) { + ASSERT_OK( + env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber()))); + } else { + ASSERT_OK(env_->FileExists( + LogFileName(alternative_wal_dir_, log_files[0]->LogNumber()))); + } + } +} + +TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + + // Ensures full purge cannot delete a WAL while it's in the process of being + // recycled. In particular, we force the full purge after a file has been + // chosen for reuse, but before it has been renamed. + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.recycle_log_file_num = 1; + if (i != 0) { + options.wal_dir = alternative_wal_dir_; + } + DestroyAndReopen(options); + + // The first flush creates a second log so writes can continue before the + // flush finishes. + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + // The second flush can recycle the first log. Sync points enforce the + // full purge happens after choosing the log to recycle and before it is + // renamed. 
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+         "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+        {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+         "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+      ASSERT_OK(db_->EnableFileDeletions(true));
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+    });
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    thread.join();
+  }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(0, log_files.size());
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(1, log_files.size());
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+    std::unique_ptr<LogFile>* bad_log_file = nullptr;
+    ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+    std::unique_ptr<LogFile> log_file;
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    // Nothing has been written to the log yet.
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_EQ(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // Add some data and verify that the file size actually moves forward.
+    ASSERT_OK(Put(0, "foo", "v1"));
+    ASSERT_OK(Put(0, "foo2", "v2"));
+    ASSERT_OK(Put(0, "foo3", "v3"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // Force log files to cycle, add some more data, then check whether the
+    // log number moves forward.
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+
+    ASSERT_OK(Put(0, "foo4", "v4"));
+    ASSERT_OK(Put(0, "foo5", "v5"));
+    ASSERT_OK(Put(0, "foo6", "v6"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+  // Test for regression of WAL cleanup missing files that don't contain data
+  // for every column family.
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    uint64_t earliest_log_nums[2];
+    for (int i = 0; i < 2; ++i) {
+      if (i > 0) {
+        ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+      }
+      VectorLogPtr log_files;
+      ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+      if (log_files.size() > 0) {
+        earliest_log_nums[i] = log_files[0]->LogNumber();
+      } else {
+        earliest_log_nums[i] = std::numeric_limits<uint64_t>::max();
+      }
+    }
+    // Check that at least the first WAL was cleaned up during the recovery.
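+    // (A strictly larger earliest log number after reopening implies the
+    // original WAL was deleted.)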
+    ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed,
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 5000000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen the DB with a smaller write_buffer_size,
+  // each key will go to a new SST file.
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' flush and a new WAL file be created.
+  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the sst file count.
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+  }
+  // New WAL file
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+  options.write_buffer_size = 4096;
+  options.arena_block_size = 4096;
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    // No inserts => default is empty
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(0));
+    // The first 4 keys go to separate SSTs + 1 more SST for the 2 smaller
+    // keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(5));
+    // 1 SST for the big key + 1 SST for the small one
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    // 1 SST for all keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed,
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  options.arena_block_size = 4 * 1024;
+  options.avoid_flush_during_recovery = false;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make the 'nikitich' memtable be flushed.
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // 4 memtables are not flushed; 1 sst file.
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+  // The memtable for 'nikitich' has been flushed and a new WAL file has been
+  // opened; 4 memtables are still not flushed.
+
+  // Write to the new WAL file.
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Fill up 'nikitich' one more time.
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  // Make it flush.
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // There are still 4 memtables not flushed, and 2 sst tables.
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    std::vector<std::string> table_files = ListTableFiles(env_, dbname_);
+    // Check that the records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'.
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+  const uint64_t kNumBatches = 2;
+  const int kBatchSize = 1000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  Reopen(options);
+
+  WriteBatch batch;
+  WriteOptions wo;
+  wo.sync = true;
+
+  for (uint64_t b = 0; b < kNumBatches; b++) {
+    batch.Clear();
+    for (int i = 0; i < kBatchSize; i++) {
+      ASSERT_OK(batch.Put(Key(i), DummyString(128)));
+    }
+
+    ASSERT_OK(dbfull()->Write(wo, &batch));
+  }
+
+  ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// logs, ignoring the actual sequence ids of the records. This is incorrect if
+// some writes come with WAL disabled.
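+// As a small illustration (hypothetical numbers): a WAL write at seq 1, two
+// WAL-disabled writes at seqs 2-3, then a WAL write at seq 4. Replaying with a
+// local counter would assign the second WAL record seq 2 instead of its actual
+// seq 4 -- the bug the test below guards against.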
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_env.get();
+  options.disable_auto_compactions = true;
+  WriteOptions wal_on, wal_off;
+  wal_on.sync = true;
+  wal_on.disableWAL = false;
+  wal_off.disableWAL = true;
+  CreateAndReopenWithCF({"dummy"}, options);
+  ASSERT_OK(Put(1, "dummy", "d1", wal_on));  // seq id 1
+  ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+  ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+  ASSERT_OK(Put(0, "key", "v4", wal_on));  // seq id 4
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Put(0, "key", "v5", wal_on));  // seq id 5
+  ASSERT_EQ("v5", Get(0, "key"));
+  ASSERT_OK(dbfull()->FlushWAL(false));
+  // Simulate a crash.
+  fault_env->SetFilesystemActive(false);
+  Close();
+  fault_env->ResetState();
+  ReopenWithColumnFamilies({"default", "dummy"}, options);
+  // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+  ASSERT_EQ("v5", Get(0, "key"));
+  // Destroy the DB before destructing fault_env.
+  Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+  // Number of WAL files to generate
+  static constexpr int kWALFilesCount = 10;
+  // Starting number for the WAL file name like 00010.log
+  static constexpr int kWALFileOffset = 10;
+  // Keys to be written per WAL file
+  static constexpr int kKeysPerWALFile = 133;
+  // Size of the value
+  static constexpr int kValueSize = 96;
+
+  // Create WAL files with values filled in
+  static void FillData(DBWALTestBase* test, const Options& options,
+                       const size_t wal_count, size_t* count) {
+    // Calling internal functions requires sanitized options.
+    Options sanitized_options = SanitizeOptions(test->dbname_, options);
+    const ImmutableDBOptions db_options(sanitized_options);
+
+    *count = 0;
+
+    std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+    FileOptions file_options;
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+    std::unique_ptr<VersionSet> versions;
+    std::unique_ptr<WalManager> wal_manager;
+    WriteController write_controller;
+
+    versions.reset(new VersionSet(
+        test->dbname_, &db_options, file_options, table_cache.get(),
+        &write_buffer_manager, &write_controller,
+        /*block_cache_tracer=*/nullptr,
+        /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ ""));
+
+    wal_manager.reset(
+        new WalManager(db_options, file_options, /*io_tracer=*/nullptr));
+
+    std::unique_ptr<log::Writer> current_log_writer;
+
+    for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+      uint64_t current_log_number = j;
+      std::string fname = LogFileName(test->dbname_, current_log_number);
+      std::unique_ptr<WritableFileWriter> file_writer;
+      ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(),
+                                           fname, file_options, &file_writer,
+                                           nullptr));
+      log::Writer* log_writer =
+          new log::Writer(std::move(file_writer), current_log_number,
+                          db_options.recycle_log_file_num > 0, false,
+                          db_options.wal_compression);
+      ASSERT_OK(log_writer->AddCompressionTypeRecord());
+      current_log_writer.reset(log_writer);
+
+      WriteBatch batch;
+      for (int i = 0; i < kKeysPerWALFile; i++) {
+        std::string key = "key" + std::to_string((*count)++);
+        std::string value = test->DummyString(kValueSize);
+        ASSERT_NE(current_log_writer.get(), nullptr);
+        uint64_t seq = versions->LastSequence() + 1;
+        batch.Clear();
+        ASSERT_OK(batch.Put(key, value));
+        WriteBatchInternal::SetSequence(&batch, seq);
+        ASSERT_OK(current_log_writer->AddRecord(
+            WriteBatchInternal::Contents(&batch)));
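+        // Keep the VersionSet's sequence bookkeeping in step with what was
+        // just written so the next record gets a fresh sequence number.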
+        versions->SetLastAllocatedSequence(seq);
+        versions->SetLastPublishedSequence(seq);
+        versions->SetLastSequence(seq);
+      }
+    }
+  }
+
+  // Recreate and fill the store with some data
+  static size_t FillData(DBWALTestBase* test, Options* options) {
+    options->create_if_missing = true;
+    test->DestroyAndReopen(*options);
+    test->Close();
+
+    size_t count = 0;
+    FillData(test, *options, kWALFilesCount, &count);
+    return count;
+  }
+
+  // Read back all the keys we wrote and return the number of keys found
+  static size_t GetData(DBWALTestBase* test) {
+    size_t count = 0;
+    for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+      if (test->Get("key" + std::to_string(i)) != "NOT_FOUND") {
+        ++count;
+      }
+    }
+    return count;
+  }
+
+  // Manually corrupt the specified WAL
+  static void CorruptWAL(DBWALTestBase* test, const Options& options,
+                         const double off, const double len,
+                         const int wal_file_id, const bool trunc = false) {
+    Env* env = options.env;
+    std::string fname = LogFileName(test->dbname_, wal_file_id);
+    uint64_t size;
+    ASSERT_OK(env->GetFileSize(fname, &size));
+    ASSERT_GT(size, 0);
+#ifdef OS_WIN
+    // The Windows disk cache behaves differently: when we truncate, the
+    // original content is still in the cache because the original handle is
+    // still open. Generally, Windows prohibits shared access to files; this
+    // is not needed for the WAL, but we allow it here to induce corruption in
+    // various tests.
+    test->Close();
+#endif
+    if (trunc) {
+      ASSERT_OK(
+          test::TruncateFile(env, fname, static_cast<uint64_t>(size * off)));
+    } else {
+      ASSERT_OK(test::CorruptFile(env, fname,
+                                  static_cast<int>(size * off + 8),
+                                  static_cast<int>(size * len), false));
+    }
+  }
+};
+
+class DBWALTestWithParams
+    : public DBWALTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, int, int, CompressionType>> {
+ public:
+  DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    Wal, DBWALTestWithParams,
+    ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1),
+                       ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+                                        RecoveryTestHelper::kWALFileOffset +
+                                            RecoveryTestHelper::kWALFilesCount,
+                                        1),
+                       ::testing::Values(CompressionType::kNoCompression,
+                                         CompressionType::kZSTD)));
+
+class DBWALTestWithParamsVaryingRecoveryMode
+    : public DBWALTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, int, int, WALRecoveryMode, CompressionType>> {
+ public:
+  DBWALTestWithParamsVaryingRecoveryMode()
+      : DBWALTestBase("/db_wal_test_with_params_mode") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    Wal, DBWALTestWithParamsVaryingRecoveryMode,
+    ::testing::Combine(
+        ::testing::Bool(), ::testing::Range(0, 4, 1),
+        ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+                         RecoveryTestHelper::kWALFileOffset +
+                             RecoveryTestHelper::kWALFilesCount,
+                         1),
+        ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords,
+                          WALRecoveryMode::kAbsoluteConsistency,
+                          WALRecoveryMode::kPointInTimeRecovery,
+                          WALRecoveryMode::kSkipAnyCorruptedRecords),
+        ::testing::Values(CompressionType::kNoCompression,
+                          CompressionType::kZSTD)));
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs.
+// - We do not expect to open the data store for corruption.
+TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) {
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  // Test checksum failure or parsing.
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  if (trunc) {
+    options.create_if_missing = false;
+    ASSERT_OK(TryReopen(options));
+    const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+    ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0);
+    ASSERT_LT(recovered_row_count, row_count);
+  } else {
+    ASSERT_NOK(TryReopen(options));
+  }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_P(DBWALTestWithParams, kAbsoluteConsistency) {
+  // Verify clean slate behavior.
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+  options.wal_compression = compression_type;
+
+  if (trunc && corrupt_offset == 0) {
+    return;
+  }
+
+  // Fill with new data.
+  RecoveryTestHelper::FillData(this, &options);
+  // Corrupt the WAL.
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+                                 /*len%=*/.1, wal_file_id, trunc);
+  // Verify.
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  options.create_if_missing = false;
+  ASSERT_NOK(TryReopen(options));
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+
+  // Create DB with multiple column families.
+  CreateAndReopenWithCF({"one", "two"}, options);
+  ASSERT_OK(Put(1, "key1", "val1"));
+  ASSERT_OK(Put(2, "key2", "val2"));
+
+  // Record the offset at this point.
+  Env* env = options.env;
+  uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+  std::string fname = LogFileName(dbname_, wal_file_id);
+  uint64_t offset_to_corrupt;
+  ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+  ASSERT_GT(offset_to_corrupt, 0);
+
+  ASSERT_OK(Put(1, "key3", "val3"));
+  // Corrupt the WAL at the location of key3.
+  ASSERT_OK(test::CorruptFile(env, fname,
+                              static_cast<int>(offset_to_corrupt), 4, false));
+  ASSERT_OK(Put(2, "key4", "val4"));
+  ASSERT_OK(Put(1, "key5", "val5"));
+  ASSERT_OK(Flush(2));
+
+  // PIT recovery & verify.
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.track_and_verify_wals_in_manifest = true;
+  // The following makes sure there are two bg flush threads.
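+  // (RocksDB divides max_background_jobs between flushes and compactions;
+  // 8 jobs yields at least 2 flush threads, which the GetBGJobLimits()
+  // assertion below double-checks.)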
+  options.max_background_jobs = 8;
+
+  DestroyAndReopen(options);
+
+  const std::string cf1_name("cf1");
+  CreateAndReopenWithCF({cf1_name}, options);
+  assert(handles_.size() == 2);
+
+  {
+    dbfull()->TEST_LockMutex();
+    ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
+    dbfull()->TEST_UnlockMutex();
+  }
+
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+
+  ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, handles_[1]));
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
+
+  bool called = false;
+  std::atomic<int> bg_flush_threads{0};
+  std::atomic<bool> wal_synced{false};
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+        int cur = bg_flush_threads.load();
+        int desired = cur + 1;
+        if (cur > 0 ||
+            !bg_flush_threads.compare_exchange_strong(cur, desired)) {
+          while (!wal_synced.load()) {
+            // Wait until the other bg flush thread finishes committing the
+            // WAL sync operation to the MANIFEST.
+          }
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+      [&](void* /*arg*/) { wal_synced.store(true); });
+  // This callback will be called when the first bg flush thread reaches the
+  // point before entering the MANIFEST write queue after flushing the SST
+  // file.
+  // The purpose of the sync points here is to ensure both bg flush threads
+  // finish computing `min_wal_number_to_keep` before any of them updates the
+  // `log_number` for the column family that's being flushed.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
+      [&](void* /*arg*/) {
+        dbfull()->mutex()->AssertHeld();
+        if (!called) {
+          // We are the first bg flush thread in the MANIFEST write queue.
+          // We set up the dependency between sync points for two threads that
+          // will be executing the same code.
+          // For the interleaving of events, see
+          // https://github.com/facebook/rocksdb/pull/9715.
+          // Bg flush thread1 will release the db mutex while in the MANIFEST
+          // write queue. In the meantime, bg flush thread2 locks the db mutex
+          // and computes the min_wal_number_to_keep (before thread1 writes to
+          // the MANIFEST, thus before cf1->log_number is updated). Bg thread2
+          // joins the MANIFEST write queue afterwards, and bg flush thread1
+          // proceeds with writing to the MANIFEST.
+          called = true;
+          SyncPoint::GetInstance()->LoadDependency({
+              {"VersionSet::LogAndApply:WriteManifestStart",
+               "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
+              {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
+               "VersionSet::LogAndApply:WriteManifest"},
+          });
+        } else {
+          // The other bg flush thread has already been in the MANIFEST write
+          // queue, and we are after it.
+          TEST_SYNC_POINT(
+              "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+  ASSERT_TRUE(called);
+
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  DB* db1 = nullptr;
+  Status s = DB::OpenForReadOnly(options, dbname_, &db1);
+  ASSERT_OK(s);
+  assert(db1);
+  delete db1;
+}
+
+// Test scope:
+// - We expect to open the data store under all circumstances.
+// - We expect only data up to the point where the first error was
+//   encountered.
+TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
+  const size_t maxkeys =
+      RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  options.wal_compression = compression_type;
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+  // Corrupt the WAL.
+  // An offset factor of 0.3 cut off the WAL right at the end of a valid
+  // fragment once WAL zstd compression (with its checksum) was enabled, so
+  // the value was changed to 0.33.
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  // Verify
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+
+  // Probe data for invariants
+  size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+  ASSERT_LT(recovered_row_count, row_count);
+
+  // Verify a prefix of keys were recovered. But not in the case of full WAL
+  // truncation, because we have no way to know there was a corruption when
+  // truncation happened on record boundaries (preventing recovery holes in
+  // that case requires using `track_and_verify_wals_in_manifest`).
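+  // (expect_data flips to false at the first missing key; any key found after
+  // that point would be a hole in the recovered prefix and fail the
+  // ASSERT_EQ.)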
+  if (!trunc || corrupt_offset != 0) {
+    bool expect_data = true;
+    for (size_t k = 0; k < maxkeys; ++k) {
+      bool found = Get("key" + std::to_string(k)) != "NOT_FOUND";
+      if (expect_data && !found) {
+        expect_data = false;
+      }
+      ASSERT_EQ(found, expect_data);
+    }
+  }
+
+  const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+                     (wal_file_id - RecoveryTestHelper::kWALFileOffset);
+  ASSERT_GE(recovered_row_count, min);
+  if (!trunc && corrupt_offset != 0) {
+    const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+                       (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1);
+    ASSERT_LE(recovered_row_count, max);
+  }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios.
+// - We expect to have recovered records past the corruption zone.
+TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) {
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  options.wal_compression = compression_type;
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+  // Corrupt the WAL
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  // Verify behavior
+  options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+
+  // Probe data for invariants
+  size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+  ASSERT_LT(recovered_row_count, row_count);
+
+  if (!trunc) {
+    ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0);
+  }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.avoid_flush_during_recovery = false;
+
+  // Test with a flush after recovery.
+  Reopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Put("bar", "v4"));
+  ASSERT_EQ(1, TotalTableFiles());
+  // Reopen the DB. Check that the WAL logs were flushed.
+  Reopen(options);
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v4", Get("bar"));
+  ASSERT_EQ(2, TotalTableFiles());
+
+  // Test without a flush after recovery.
+  options.avoid_flush_during_recovery = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v5"));
+  ASSERT_OK(Put("bar", "v6"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v7"));
+  ASSERT_OK(Put("bar", "v8"));
+  ASSERT_EQ(1, TotalTableFiles());
+  // Reopen the DB. The WAL logs should not be flushed this time.
+  Reopen(options);
+  ASSERT_EQ("v7", Get("foo"));
+  ASSERT_EQ("v8", Get("bar"));
+  ASSERT_EQ(1, TotalTableFiles());
+
+  // Force a flush with allow_2pc.
+  options.avoid_flush_during_recovery = true;
+  options.allow_2pc = true;
+  ASSERT_OK(Put("foo", "v9"));
+  ASSERT_OK(Put("bar", "v10"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v11"));
+  ASSERT_OK(Put("bar", "v12"));
+  Reopen(options);
+  ASSERT_EQ("v11", Get("foo"));
+  ASSERT_EQ("v12", Get("bar"));
+  ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+  // Verifies that WAL files which were present during recovery, but not
+  // flushed due to avoid_flush_during_recovery, will be considered for
+  // deletion at a later stage. We check that at least one such file is
+  // deleted during Flush().
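+  // (First pass: the recovered-but-unflushed WALs are still listed. Second
+  // pass, after Flush(): they have become obsolete and been deleted.)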
+ Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.avoid_flush_during_recovery = true; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + Reopen(options); + for (int i = 0; i < 2; ++i) { + if (i > 0) { + // Flush() triggers deletion of obsolete tracked files + ASSERT_OK(Flush()); + } + VectorLogPtr log_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + if (i == 0) { + ASSERT_GT(log_files.size(), 0); + } else { + ASSERT_EQ(0, log_files.size()); + } + } +} + +TEST_F(DBWALTest, RecoverWithoutFlush) { + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + options.write_buffer_size = 64 * 1024 * 1024; + + size_t count = RecoveryTestHelper::FillData(this, &options); + auto validateData = [this, count]() { + for (size_t i = 0; i < count; i++) { + ASSERT_NE(Get("key" + std::to_string(i)), "NOT_FOUND"); + } + }; + Reopen(options); + validateData(); + // Insert some data without flush + ASSERT_OK(Put("foo", "foo_v1")); + ASSERT_OK(Put("bar", "bar_v1")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v1"); + ASSERT_EQ(Get("bar"), "bar_v1"); + // Insert again and reopen + ASSERT_OK(Put("foo", "foo_v2")); + ASSERT_OK(Put("bar", "bar_v2")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + // manual flush and insert again + ASSERT_OK(Flush()); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + ASSERT_OK(Put("foo", "foo_v3")); + ASSERT_OK(Put("bar", "bar_v3")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v3"); + ASSERT_EQ(Get("bar"), "bar_v3"); +} + +TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { + const std::string kSmallValue = "v"; + const std::string kLargeValue = DummyString(1024); + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + + auto countWalFiles = [this]() { + VectorLogPtr log_files; + if (!dbfull()->GetSortedWalFiles(log_files).ok()) { + return size_t{0}; + } + return log_files.size(); + }; + + // Create DB with multiple column families and multiple log files. + CreateAndReopenWithCF({"one", "two"}, options); + ASSERT_OK(Put(0, "key1", kSmallValue)); + ASSERT_OK(Put(1, "key2", kLargeValue)); + ASSERT_OK(Flush(1)); + ASSERT_EQ(1, countWalFiles()); + ASSERT_OK(Put(0, "key3", kSmallValue)); + ASSERT_OK(Put(2, "key4", kLargeValue)); + ASSERT_OK(Flush(2)); + ASSERT_EQ(2, countWalFiles()); + + // Reopen, insert and flush. + options.db_write_buffer_size = 64 * 1024 * 1024; + ReopenWithColumnFamilies({"default", "one", "two"}, options); + ASSERT_EQ(Get(0, "key1"), kSmallValue); + ASSERT_EQ(Get(1, "key2"), kLargeValue); + ASSERT_EQ(Get(0, "key3"), kSmallValue); + ASSERT_EQ(Get(2, "key4"), kLargeValue); + // Insert more data. + ASSERT_OK(Put(0, "key5", kLargeValue)); + ASSERT_OK(Put(1, "key6", kLargeValue)); + ASSERT_EQ(3, countWalFiles()); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(2, "key7", kLargeValue)); + ASSERT_OK(dbfull()->FlushWAL(false)); + ASSERT_EQ(4, countWalFiles()); + + // Reopen twice and validate. 
+  for (int i = 0; i < 2; i++) {
+    ReopenWithColumnFamilies({"default", "one", "two"}, options);
+    ASSERT_EQ(Get(0, "key1"), kSmallValue);
+    ASSERT_EQ(Get(1, "key2"), kLargeValue);
+    ASSERT_EQ(Get(0, "key3"), kSmallValue);
+    ASSERT_EQ(Get(2, "key4"), kLargeValue);
+    ASSERT_EQ(Get(0, "key5"), kLargeValue);
+    ASSERT_EQ(Get(1, "key6"), kLargeValue);
+    ASSERT_EQ(Get(2, "key7"), kLargeValue);
+    ASSERT_EQ(4, countWalFiles());
+  }
+}
+
+// In this test we are trying to do the following:
+//   1. Create a DB with a corrupted WAL log;
+//   2. Open with avoid_flush_during_recovery = true;
+//   3. Append more data without flushing, which creates a new WAL log;
+//   4. Open again. See if it can correctly handle the previous corruption.
+TEST_P(DBWALTestWithParamsVaryingRecoveryMode,
+       RecoverFromCorruptedWALWithoutFlush) {
+  const int kAppendKeys = 100;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.create_if_missing = false;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 64 * 1024 * 1024;
+
+  auto getAll = [this]() {
+    std::vector<std::pair<std::string, std::string>> data;
+    ReadOptions ropt;
+    Iterator* iter = dbfull()->NewIterator(ropt);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      data.push_back(
+          std::make_pair(iter->key().ToString(), iter->value().ToString()));
+    }
+    delete iter;
+    return data;
+  };
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  WALRecoveryMode recovery_mode = std::get<3>(GetParam());
+  // WAL compression type
+  CompressionType compression_type = std::get<4>(GetParam());
+
+  options.wal_recovery_mode = recovery_mode;
+  options.wal_compression = compression_type;
+  // Create a corrupted WAL.
+  RecoveryTestHelper::FillData(this, &options);
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+  // Skip the test if the DB won't open.
+  if (!TryReopen(options).ok()) {
+    ASSERT_TRUE(options.wal_recovery_mode ==
+                    WALRecoveryMode::kAbsoluteConsistency ||
+                (!trunc && options.wal_recovery_mode ==
+                               WALRecoveryMode::kTolerateCorruptedTailRecords));
+    return;
+  }
+  ASSERT_OK(TryReopen(options));
+  // Append some more data.
+  for (int k = 0; k < kAppendKeys; k++) {
+    std::string key = "extra_key" + std::to_string(k);
+    std::string value = DummyString(RecoveryTestHelper::kValueSize);
+    ASSERT_OK(Put(key, value));
+  }
+  // Save data for comparison.
+  auto data = getAll();
+  // Reopen. Verify data.
+  ASSERT_OK(TryReopen(options));
+  auto actual_data = getAll();
+  ASSERT_EQ(data, actual_data);
+}
+
+// Tests that the total log size is recovered if we set
+// avoid_flush_during_recovery = true.
+// A flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+  auto test_listener = std::make_shared<FlushCounterListener>();
+  test_listener->expected_flush_reason = FlushReason::kWalFull;
+
+  constexpr size_t kKB = 1024;
+  constexpr size_t kMB = 1024 * 1024;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.max_total_wal_size = 1 * kMB;
+  options.listeners.push_back(test_listener);
+  // Have to open the DB in multi-CF mode to trigger a flush when
+  // max_total_wal_size is reached.
+  CreateAndReopenWithCF({"one"}, options);
+  // Write some keys and we will end up with one log file which is slightly
+  // smaller than 1MB.
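+  // (9 values of 100 KB each, plus overhead, lands between 900 KB and 1 MB,
+  // as the size assertions below verify.)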
+  std::string value_100k(100 * kKB, 'v');
+  std::string value_300k(300 * kKB, 'v');
+  ASSERT_OK(Put(0, "foo", "v1"));
+  for (int i = 0; i < 9; i++) {
+    ASSERT_OK(Put(1, "key" + std::to_string(i), value_100k));
+  }
+  // Get the log files before reopening.
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+  ASSERT_GT(log_size_before, 900 * kKB);
+  ASSERT_LT(log_size_before, 1 * kMB);
+  ReopenWithColumnFamilies({"default", "one"}, options);
+  // Write one more value to make the log larger than 1MB.
+  ASSERT_OK(Put(1, "bar", value_300k));
+  // Get the log files again. A new log file will have been opened.
+  VectorLogPtr log_files_after_reopen;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+  ASSERT_EQ(2, log_files_after_reopen.size());
+  ASSERT_EQ(log_files_before[0]->LogNumber(),
+            log_files_after_reopen[0]->LogNumber());
+  ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+                log_files_after_reopen[1]->SizeFileBytes(),
+            1 * kMB);
+  // Write one more key to trigger a flush.
+  ASSERT_OK(Put(0, "foo", "v2"));
+  for (auto* h : handles_) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h));
+  }
+  // Both column families were flushed.
+  ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we truncate the preallocated space of the last log from the
+// previous session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  // The log file has preallocated space.
+  ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  Reopen(options);
+  VectorLogPtr log_files_after;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+  ASSERT_EQ(1, log_files_after.size());
+  ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+  // The preallocated space should be truncated.
+  ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+}
+// Tests that we truncate the preallocated space of the last log from the
+// previous session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = false;
+  options.avoid_flush_during_shutdown = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  // The log file has preallocated space.
+  Close();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::PurgeObsoleteFiles:Begin",
+        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread reopen_thread([&]() { Reopen(options); });
+
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+  // After the flush during Open, the log file should get deleted. However,
+  // if the process is in a crash loop, the log file may not get deleted and
+  // the preallocated space will keep accumulating. So we need to ensure it
+  // gets truncated.
+  EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+  reopen_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = false;
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  Close();
+  std::vector<std::string> filenames;
+  std::string last_log;
+  uint64_t last_log_num = 0;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  for (auto fname : filenames) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(fname, &number, &type, nullptr)) {
+      if (type == kWalFile && number > last_log_num) {
+        last_log = fname;
+        // Remember the largest WAL number seen so far; without this update
+        // the loop would simply keep the last .log file in listing order.
+        last_log_num = number;
+      }
+    }
+  }
+  ASSERT_NE(last_log, "");
+  last_log = dbname_ + '/' + last_log;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::PurgeObsoleteFiles:Begin",
+        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PosixWritableFile::Close",
+      [](void* arg) { *(reinterpret_cast<size_t*>(arg)) = 0; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Preallocate space for the empty log file. This could happen if WAL data
+  // was buffered in memory and the process crashed.
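+  // (Presumably ReopenWritableFile plus PrepareWrite below is enough to
+  // trigger fallocate without writing any record, leaving an empty but
+  // preallocated WAL.)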
+  std::unique_ptr<WritableFile> log_file;
+  ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions()));
+  log_file->SetPreallocationBlockSize(preallocated_size);
+  log_file->PrepareWrite(0, 4096);
+  log_file.reset();
+
+  ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size);
+
+  port::Thread reopen_thread([&]() { Reopen(options); });
+
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+  // The preallocated space should be truncated.
+  EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size);
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+  reopen_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  // Create the DB, then close it with file truncation disabled.
+  std::atomic_bool enable_truncate{false};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PosixWritableFile::Close", [&](void* arg) {
+        if (!enable_truncate) {
+          *(reinterpret_cast<size_t*>(arg)) = 0;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  // The log file has preallocated space.
+  auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName());
+  ASSERT_GE(db_size, preallocated_size);
+  Close();
+
+  // Enable truncation and open the DB read-only; the file should not be
+  // truncated and the DB size should not change.
+  enable_truncate = true;
+  ASSERT_OK(ReadOnlyReopen(options));
+  VectorLogPtr log_files_after;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+  ASSERT_EQ(1, log_files_after.size());
+  ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+  ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName());
+  // The preallocated space should NOT be truncated;
+  // the DB size stays almost the same.
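+  // (Allow ~1% slack below: the filesystem may round the allocation to block
+  // boundaries, so an exact equality check would be brittle.)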
+  ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size,
+              db_size / 100);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_FALLOCATE_PRESENT
+#endif  // ROCKSDB_PLATFORM_POSIX
+
+TEST_F(DBWALTest, WalInManifestButNotInSortedWals) {
+  Options options = CurrentOptions();
+  options.track_and_verify_wals_in_manifest = true;
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+
+  // Build a way to make wal files selectively go missing
+  bool wals_go_missing = false;
+  struct MissingWalFs : public FileSystemWrapper {
+    MissingWalFs(const std::shared_ptr<FileSystem>& t,
+                 bool* _wals_go_missing_flag)
+        : FileSystemWrapper(t), wals_go_missing_flag(_wals_go_missing_flag) {}
+    bool* wals_go_missing_flag;
+    IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                         std::vector<std::string>* r,
+                         IODebugContext* dbg) override {
+      IOStatus s = target_->GetChildren(dir, io_opts, r, dbg);
+      if (s.ok() && *wals_go_missing_flag) {
+        for (size_t i = 0; i < r->size();) {
+          if (EndsWith(r->at(i), ".log")) {
+            r->erase(r->begin() + i);
+          } else {
+            ++i;
+          }
+        }
+      }
+      return s;
+    }
+    const char* Name() const override { return "MissingWalFs"; }
+  };
+  auto my_fs =
+      std::make_shared<MissingWalFs>(env_->GetFileSystem(), &wals_go_missing);
+  std::unique_ptr<Env> my_env(NewCompositeEnv(my_fs));
+  options.env = my_env.get();
+
+  CreateAndReopenWithCF({"blah"}, options);
+
+  // Currently necessary to get a WAL tracked in manifest; see
+  // https://github.com/facebook/rocksdb/issues/10080
+  ASSERT_OK(Put(0, "x", "y"));
+  ASSERT_OK(db_->SyncWAL());
+  ASSERT_OK(Put(1, "x", "y"));
+  ASSERT_OK(db_->SyncWAL());
+  ASSERT_OK(Flush(1));
+
+  ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+  std::vector<std::unique_ptr<LogFile>> wals;
+  ASSERT_OK(db_->GetSortedWalFiles(wals));
+  wals_go_missing = true;
+  ASSERT_NOK(db_->GetSortedWalFiles(wals));
+  wals_go_missing = false;
+  Close();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "bar"));
+  batch.MarkWalTerminationPoint();
+  ASSERT_OK(batch.Put("foo2", "bar2"));
+
+  ASSERT_OK(dbfull()->Write(wo, &batch));
+
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  ASSERT_EQ("bar", Get(1, "foo"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
+  if (db_->GetOptions().wal_compression == kNoCompression) {
+    ROCKSDB_GTEST_BYPASS("stream compression not present");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  options.track_and_verify_wals_in_manifest = true;
+  // Enable WAL compression so that the newly-created WAL will be non-empty
+  // after DB open, even if point-in-time WAL recovery encounters no
+  // corruption.
+  options.wal_compression = kZSTD;
+  DestroyAndReopen(options);
+
+  // Write something to memtable and WAL so that log_empty_ will be false after
+  // next DB::Open().
+  ASSERT_OK(Put("a", "v"));
+
+  Reopen(options);
+
+  // New WAL is created, thanks to !log_empty_.
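+  // (TEST_SwitchWAL() rotates to a fresh log, so the previous, compressed
+  // WAL can be synced and then enumerated by GetSortedWalFiles() below.)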
+ ASSERT_OK(dbfull()->TEST_SwitchWAL()); + + ASSERT_OK(Put("b", "v")); + + ASSERT_OK(db_->SyncWAL()); + + VectorLogPtr wals; + Status s = dbfull()->GetSortedWalFiles(wals); + ASSERT_OK(s); +} +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_with_timestamp_basic_test.cc b/src/rocksdb/db/db_with_timestamp_basic_test.cc new file mode 100644 index 000000000..6ea1aaf46 --- /dev/null +++ b/src/rocksdb/db/db_with_timestamp_basic_test.cc @@ -0,0 +1,3880 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_with_timestamp_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace ROCKSDB_NAMESPACE { +class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { + public: + DBBasicTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {} +}; + +TEST_F(DBBasicTestWithTimestamp, SanityChecks) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.avoid_flush_during_shutdown = true; + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + DestroyAndReopen(options); + + Options options1 = CurrentOptions(); + options1.env = env_; + options1.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options1.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + assert(options1.comparator && + options1.comparator->timestamp_size() == sizeof(uint64_t)); + ColumnFamilyHandle* handle = nullptr; + Status s = db_->CreateColumnFamily(options1, "data", &handle); + ASSERT_OK(s); + + std::string dummy_ts(sizeof(uint64_t), '\0'); + // Perform timestamp operations on default cf. + ASSERT_TRUE( + db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument()); + ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key", + dummy_ts, "value") + .IsInvalidArgument()); + ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument()); + ASSERT_TRUE( + db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument()); + ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "begin_key", "end_key", dummy_ts) + .IsInvalidArgument()); + + // Perform non-timestamp operations on "data" cf. 
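+  // For contrast, a write that this CF does accept supplies a full-width
+  // (sizeof(uint64_t)) timestamp. Illustrative sketch only, not part of the
+  // test; EncodeFixed64 is the coding helper used elsewhere in RocksDB:
+  //   std::string ok_ts(sizeof(uint64_t), '\0');
+  //   EncodeFixed64(&ok_ts[0], /*value=*/42);
+  //   ASSERT_OK(db_->Put(WriteOptions(), handle, "key", ok_ts, "value"));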
+ ASSERT_TRUE( + db_->Put(WriteOptions(), handle, "key", "value").IsInvalidArgument()); + ASSERT_TRUE(db_->Delete(WriteOptions(), handle, "key").IsInvalidArgument()); + ASSERT_TRUE( + db_->SingleDelete(WriteOptions(), handle, "key").IsInvalidArgument()); + + ASSERT_TRUE( + db_->Merge(WriteOptions(), handle, "key", "value").IsInvalidArgument()); + ASSERT_TRUE(db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key") + .IsInvalidArgument()); + + { + WriteBatch wb; + ASSERT_OK(wb.Put(handle, "key", "value")); + ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument()); + } + { + WriteBatch wb; + ASSERT_OK(wb.Delete(handle, "key")); + ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument()); + } + { + WriteBatch wb; + ASSERT_OK(wb.SingleDelete(handle, "key")); + ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument()); + } + { + WriteBatch wb; + ASSERT_OK(wb.DeleteRange(handle, "begin_key", "end_key")); + ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument()); + } + + // Perform timestamp operations with timestamps of incorrect size. + const std::string wrong_ts(sizeof(uint32_t), '\0'); + ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value") + .IsInvalidArgument()); + ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value") + .IsInvalidArgument()); + ASSERT_TRUE( + db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument()); + ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts) + .IsInvalidArgument()); + ASSERT_TRUE( + db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key", wrong_ts) + .IsInvalidArgument()); + + delete handle; +} + +TEST_F(DBBasicTestWithTimestamp, MixedCfs) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.avoid_flush_during_shutdown = true; + DestroyAndReopen(options); + + Options options1 = CurrentOptions(); + options1.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options1.comparator = &test_cmp; + ColumnFamilyHandle* handle = nullptr; + Status s = db_->CreateColumnFamily(options1, "data", &handle); + ASSERT_OK(s); + + WriteBatch wb; + ASSERT_OK(wb.Put("a", "value")); + ASSERT_OK(wb.Put(handle, "a", "value")); + { + std::string ts = Timestamp(1, 0); + const auto ts_sz_func = [kTimestampSize, handle](uint32_t cf_id) { + assert(handle); + if (cf_id == 0) { + return static_cast(0); + } else if (cf_id == handle->GetID()) { + return kTimestampSize; + } else { + assert(false); + return std::numeric_limits::max(); + } + }; + ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + + const auto verify_db = [this](ColumnFamilyHandle* h, const std::string& key, + const std::string& ts, + const std::string& expected_value) { + ASSERT_EQ(expected_value, Get(key)); + Slice read_ts_slice(ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value; + ASSERT_OK(db_->Get(read_opts, h, key, &value)); + ASSERT_EQ(expected_value, value); + }; + + verify_db(handle, "a", Timestamp(1, 0), "value"); + + delete handle; + Close(); + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back("data", options1); + options.create_if_missing = false; + s = DB::Open(options, dbname_, cf_descs, &handles_, &db_); + ASSERT_OK(s); + + verify_db(handles_[1], "a", Timestamp(1, 0), "value"); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, 
CompactRangeWithSpecifiedRange) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar")); + ASSERT_OK(Flush()); + + std::string start_str = "foo"; + std::string end_str = "foo2"; + Slice start(start_str), end(end_str); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + std::string ts_str = Timestamp(1, 0); + WriteOptions wopts; + ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1")); + ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2")); + ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3")); + + ts_str = Timestamp(2, 0); + ASSERT_OK(db_->Delete(wopts, "k3", ts_str)); + + ts_str = Timestamp(4, 0); + ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v5")); + + ts_str = Timestamp(5, 0); + ASSERT_OK( + db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k0", "k9", ts_str)); + + ts_str = Timestamp(3, 0); + Slice ts = ts_str; + CompactRangeOptions cro; + cro.full_history_ts_low = &ts; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_OK(Flush()); + + ReadOptions ropts; + ropts.timestamp = &ts; + std::string value; + Status s = db_->Get(ropts, "k1", &value); + ASSERT_OK(s); + ASSERT_EQ("v1", value); + + std::string key_ts; + ASSERT_TRUE(db_->Get(ropts, "k3", &value, &key_ts).IsNotFound()); + ASSERT_EQ(Timestamp(2, 0), key_ts); + + ts_str = Timestamp(5, 0); + ts = ts_str; + ropts.timestamp = &ts; + ASSERT_TRUE(db_->Get(ropts, "k2", &value, &key_ts).IsNotFound()); + ASSERT_EQ(Timestamp(5, 0), key_ts); + ASSERT_TRUE(db_->Get(ropts, "k2", &value).IsNotFound()); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + const std::string kKey = "test kKey"; + + // Test set ts_low first and flush() + int current_ts_low = 5; + std::string ts_low_str = Timestamp(current_ts_low, 0); + Slice ts_low = ts_low_str; + CompactRangeOptions comp_opts; + comp_opts.full_history_ts_low = &ts_low; + comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + + auto* cfd = + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(); + auto result_ts_low = cfd->GetFullHistoryTsLow(); + + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts = Timestamp(i, 0); + ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i))); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < 10; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status 
status = db_->Get(read_opts, kKey, &value);
+    if (i < current_ts_low) {
+      ASSERT_TRUE(status.IsInvalidArgument());
+    } else {
+      ASSERT_OK(status);
+      ASSERT_TRUE(value.compare(Key(i)) == 0);
+    }
+  }
+
+  // Test setting ts_low and then triggering a compaction
+  for (int i = 10; i < 20; i++) {
+    WriteOptions write_opts;
+    std::string ts = Timestamp(i, 0);
+    ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+  }
+
+  ASSERT_OK(Flush());
+
+  current_ts_low = 15;
+  ts_low_str = Timestamp(current_ts_low, 0);
+  ts_low = ts_low_str;
+  comp_opts.full_history_ts_low = &ts_low;
+  ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+  result_ts_low = cfd->GetFullHistoryTsLow();
+  ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+  for (int i = current_ts_low; i < 20; i++) {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(i, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::string value;
+    Status status = db_->Get(read_opts, kKey, &value);
+    ASSERT_OK(status);
+    ASSERT_TRUE(value.compare(Key(i)) == 0);
+  }
+
+  // Test invalid compaction with a range
+  Slice start(kKey), end(kKey);
+  Status s = db_->CompactRange(comp_opts, &start, &end);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = db_->CompactRange(comp_opts, &start, nullptr);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = db_->CompactRange(comp_opts, nullptr, &end);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Test invalid compaction with a decreasing ts_low
+  ts_low_str = Timestamp(current_ts_low - 1, 0);
+  ts_low = ts_low_str;
+  comp_opts.full_history_ts_low = &ts_low;
+  s = db_->CompactRange(comp_opts, nullptr, nullptr);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  std::string ts_low_str = Timestamp(9, 0);
+  ASSERT_OK(
+      db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str));
+  std::string result_ts_low;
+  ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low));
+  ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0);
+  // Test moving full_history_ts_low backward
+  std::string ts_low_str_back = Timestamp(8, 0);
+  auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                         ts_low_str_back);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // Test IncreaseFullHistoryTsLow with a timestamp whose length is longer
+  // than the cf's timestamp size
+  std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a');
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                    ts_low_str_long);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // Test IncreaseFullHistoryTsLow with a timestamp that is empty
+  std::string ts_low_str_null = "";
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                    ts_low_str_null);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // Test IncreaseFullHistoryTsLow for a column family that does not enable
+  // timestamp
+  options.comparator = BytewiseComparator();
+  DestroyAndReopen(options);
+  ts_low_str = Timestamp(10, 0);
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // Test GetFullHistoryTsLow for a column family that does not enable
+  // timestamp
+  std::string current_ts_low;
+  s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(),
+                               &current_ts_low);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  auto default_cf = db_->DefaultColumnFamily();
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(db_->Put(write_opts, Key(i), ts, rnd.RandomString(1024)));
+  }
+
+  uint64_t size;
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  SizeApproximationOptions size_approx_options;
+  size_approx_options.include_memtables = true;
+  size_approx_options.include_files = true;
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+
+  // test multiple ranges
+  std::vector<Range> ranges;
+  std::string start_tmp = Key(10);
+  std::string end_tmp = Key(20);
+  ranges.emplace_back(Range(start_tmp, end_tmp));
+  ranges.emplace_back(Range(start, end));
+  uint64_t range_sizes[2];
+  ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf,
+                                     ranges.data(), 2, range_sizes));
+
+  ASSERT_EQ(range_sizes[1], size);
+
+  // Zero if not including mem table
+  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  // Test range boundaries
+  ASSERT_OK(db_->Put(write_opts, Key(1000), ts, rnd.RandomString(1024)));
+  // Should include start key
+  start = Key(1000);
+  end = Key(1100);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 0);
+
+  // Should exclude end key
+  start = Key(900);
+  end = Key(1000);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleIterate) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
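+    // (For i == 0 the smallest key visible at this read timestamp is
+    // Key1(start_keys[0]) == Key1(1), so the Seek(Key1(0)) below lands on
+    // Key1(start_keys[i]) rather than Key1(0).)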
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid(); + it->Next(), ++count, ++key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + size_t expected_count = kMaxKey - start_keys[i] + 1; + ASSERT_EQ(expected_count, count); + + // Backward iterate. + count = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); + + // SeekToFirst()/SeekToLast() with lower/upper bounds. + // Then iter with lower and upper bounds. + uint64_t l = 0; + uint64_t r = kMaxKey + 1; + while (l < r) { + std::string lb_str = Key1(l); + Slice lb = lb_str; + std::string ub_str = Key1(r); + Slice ub = ub_str; + read_opts.iterate_lower_bound = &lb; + read_opts.iterate_upper_bound = &ub; + it.reset(db_->NewIterator(read_opts)); + for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0; + it->Valid(); it->Next(), ++key, ++count) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(r - std::max(l, start_keys[i]), count); + + for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; + it->Valid(); it->Prev(), --key, ++count) { + CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + l += (kMaxKey / 100); + r -= (kMaxKey / 100); + } + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + auto check_value_by_ts = [](DB* db, Slice key, std::string readTs, + Status status, std::string checkValue, + std::string expected_ts) { + ReadOptions ropts; + Slice ts = readTs; + ropts.timestamp = &ts; + std::string value; + std::string key_ts; + Status s = db->Get(ropts, key, &value, &key_ts); + ASSERT_TRUE(s == status); + if (s.ok()) { + ASSERT_EQ(checkValue, value); + } + if (s.ok() || s.IsNotFound()) { + ASSERT_EQ(expected_ts, key_ts); + } + }; + // Construct data of different versions with different ts + ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(2, 0), "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2")); + ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0))); + ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3")); + check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3", + Timestamp(6, 0)); + ASSERT_OK(Flush()); + Close(); + + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + DBOptions db_options(options); + + // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND. 
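+  // (OpenAndTrimHistory() drops every version newer than the trim point, so
+  // after trimming at ts=5 the Put at ts=6 is gone and the standing Delete
+  // at ts=5 is what the read below observes.)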
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, + &handles_, &db_, Timestamp(5, 0))); + check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "", + Timestamp(5, 0)); + Close(); + + // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2 + ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, + &handles_, &db_, Timestamp(4, 0))); + check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2", + Timestamp(4, 0)); + Close(); + + Reopen(options); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1", + "k3", Timestamp(7, 0))); + check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "", + Timestamp(7, 0)); + Close(); + // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2 + ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, + &handles_, &db_, Timestamp(6, 0))); + check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2", + Timestamp(4, 0)); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) { + Destroy(last_options_); + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + DBOptions db_options(options); + + // OpenAndTrimHistory should not work with avoid_flush_during_recovery + db_options.avoid_flush_during_recovery = true; + ASSERT_TRUE(DB::OpenAndTrimHistory(db_options, dbname_, column_families, + &handles_, &db_, Timestamp(0, 0)) + .IsInvalidArgument()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Create 2 tables + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10; i++) { + std::string ts = Timestamp(i, 0); + ASSERT_OK(db_->Put(WriteOptions(), "key", ts, Key(i))); + } + ASSERT_OK(Flush()); + } + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(2U, props.size()); + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") != + user_collected.end()); + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") != + user_collected.end()); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0)); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0)); + } + Close(); +} +#endif // !ROCKSDB_LITE + +class DBBasicTestWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + explicit DBBasicTestWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampTableOptions, + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)); + +TEST_P(DBBasicTestWithTimestampTableOptions, 
GetAndMultiGet) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.index_type = GetParam(); + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator cmp(kTimestampSize); + options.comparator = &cmp; + DestroyAndReopen(options); + constexpr uint64_t kNumKeys = 1024; + for (uint64_t k = 0; k < kNumKeys; ++k) { + WriteOptions write_opts; + ASSERT_OK(db_->Put(write_opts, Key1(k), Timestamp(1, 0), + "value" + std::to_string(k))); + } + ASSERT_OK(Flush()); + { + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::string ts_str = Timestamp(2, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // verify Get() + for (it->SeekToFirst(); it->Valid(); it->Next()) { + std::string value_from_get; + std::string key_str(it->key().data(), it->key().size()); + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp)); + ASSERT_EQ(it->value(), value_from_get); + ASSERT_EQ(Timestamp(1, 0), timestamp); + } + + // verify MultiGet() + constexpr uint64_t step = 2; + static_assert(0 == (kNumKeys % step), + "kNumKeys must be a multiple of step"); + for (uint64_t k = 0; k < kNumKeys; k += 2) { + std::vector key_strs; + std::vector keys; + for (size_t i = 0; i < step; ++i) { + key_strs.push_back(Key1(k + i)); + } + for (size_t i = 0; i < step; ++i) { + keys.emplace_back(key_strs[i]); + } + std::vector values; + std::vector timestamps; + std::vector statuses = + db_->MultiGet(read_opts, keys, &values, ×tamps); + ASSERT_EQ(step, statuses.size()); + ASSERT_EQ(step, values.size()); + ASSERT_EQ(step, timestamps.size()); + for (uint64_t i = 0; i < step; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("value" + std::to_string(k + i), values[i]); + ASSERT_EQ(Timestamp(1, 0), timestamps[i]); + } + } + } + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + Slice read_ts = ts; + read_opts.timestamp = &read_ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + 
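+    // Seeking with a prefix ("bbb") smaller than any stored key must leave
+    // the iterator invalid but with an OK status.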
iter->Seek("bbb"); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + // All of the keys or this test must be longer than 3 characters + constexpr int kMinKeyLen = 3; + options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + ts = Timestamp(2, 0); + Slice read_ts = ts; + read_opts.timestamp = &read_ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar1")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar2")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (int i = 3; i < 9; ++i) { + ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), ts, + "bar" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + ts = Timestamp(2, 0); + Slice read_ts = ts; + read_opts.timestamp = &read_ts; + std::string up_bound = "foo5"; // exclusive + Slice up_bound_slice = up_bound; + std::string lo_bound = "foo2"; // inclusive + Slice lo_bound_slice = lo_bound; + read_opts.iterate_upper_bound = &up_bound_slice; + read_opts.iterate_lower_bound = &lo_bound_slice; + read_opts.auto_prefix_mode = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. 
+ iter->Seek("foo"); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekForPrev("g"); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), + Timestamp(4, 3)}; + const std::vector> kvs = { + std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")}; + for (const auto& ts : timestamps) { + WriteBatch wb(0, 0, 0, kTimestampSize); + for (const auto& kv : kvs) { + const std::string& key = std::get<0>(kv); + const std::string& value = std::get<1>(kv); + ASSERT_OK(wb.Put(key, value)); + } + + ASSERT_OK(wb.UpdateTimestamps( + ts, [kTimestampSize](uint32_t) { return kTimestampSize; })); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + std::string read_ts_str = Timestamp(5, 3); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + + it->SeekToFirst(); + ASSERT_TRUE(it->Valid()); + it->Prev(); + ASSERT_FALSE(it->Valid()); + + it->SeekToLast(); + ASSERT_TRUE(it->Valid()); + uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(0, prev_reseek_count); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->Seek(std::get<0>(kvs[0])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(1, prev_reseek_count); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->SeekForPrev(std::get<0>(kvs[1])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it.reset(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t 
kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), write_timestamps[i], + "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + if (i > 0) { + it->Next(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i - 1), + write_timestamps[i - 1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key), write_timestamp); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_timestamp); + // Skip key@ts=3 and land on tombstone key@ts=5 + it->Next(); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), write_timestamps[i], + "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); 
++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, "value0", + write_timestamps[0]); + if (i > 0) { + it->Prev(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1", + write_timestamps[1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key), write_timestamp); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = kMaxKey; + for (it->SeekForPrev(Key1(key)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1", + Timestamp(3, 0)); + it->Prev(); + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_timestamp); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleBackwardIterateLowerTsBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + std::string ts_ub_buf = Timestamp(5, 0); + Slice ts_ub = ts_ub_buf; + std::string ts_lb_buf = Timestamp(1, 0); + Slice ts_lb = ts_lb_buf; + + { + ReadOptions read_opts; + read_opts.timestamp = &ts_ub; + read_opts.iter_start_ts = &ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + it->SeekToLast(); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + + it->SeekForPrev("foo"); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + } + + // Test iterate_upper_bound + ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(0, 0), "v0")); + ASSERT_OK(db_->SingleDelete(WriteOptions(), "a", Timestamp(1, 0))); + + for (int i = 0; i < 5; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0), + "v" + std::to_string(i))); + } + + { + ReadOptions read_opts; + read_opts.timestamp = &ts_ub; + read_opts.iter_start_ts = &ts_lb; + std::string key_ub_str = "b"; // exclusive + Slice key_ub = key_ub_str; + read_opts.iterate_upper_bound = &key_ub; + std::unique_ptr it(db_->NewIterator(read_opts)); + it->SeekToLast(); + CheckIterEntry(it.get(), "a", kTypeSingleDeletion, Slice(), + Timestamp(1, 0)); + + key_ub_str = "a"; // exclusive + key_ub = key_ub_str; + read_opts.iterate_upper_bound = &key_ub; + it.reset(db_->NewIterator(read_opts)); + it->SeekToLast(); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + 
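+  // Keep max_sequential_skip_in_iterations tiny so that Prev() over the
+  // stacked versions written below falls back to a reseek quickly.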
options.max_sequential_skip_in_iterations = 2; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(i, 0), + "v" + std::to_string(i))); + } + + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0), + "v" + std::to_string(i))); + } + + { + std::string ts_ub_buf = Timestamp(6, 0); + Slice ts_ub = ts_ub_buf; + std::string ts_lb_buf = Timestamp(4, 0); + Slice ts_lb = ts_lb_buf; + + ReadOptions read_opts; + read_opts.timestamp = &ts_ub; + read_opts.iter_start_ts = &ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + it->SeekToLast(); + for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) { + CheckIterEntry(it.get(), "b", kTypeValue, "v" + std::to_string(4 + i), + Timestamp(4 + i, 0)); + } + for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) { + CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i), + Timestamp(4 + i, 0)); + } + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Insert kNumKeys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts = Timestamp(static_cast(i + 1), 0); + s = db_->Put(write_opts, "foo", ts, "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + ts_str = Timestamp(kNumKeys, 0); + ts = ts_str; + read_opts.timestamp = &ts; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, + "value" + std::to_string(kNumKeys - 1), ts_str); + ASSERT_EQ( + 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Write kNumKeys + 1 keys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts = Timestamp(static_cast(i + 1), 0); + s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + WriteBatch batch(0, 0, 0, kTimestampSize); + { ASSERT_OK(batch.Put("a", "new_value")); } + { ASSERT_OK(batch.Put("b", "new_value")); } + s = batch.UpdateTimestamps( + ts_str, [kTimestampSize](uint32_t) { return 
kTimestampSize; }); + ASSERT_OK(s); + s = db_->Write(write_opts, &batch); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("a"); + iter->Next(); + CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts = Timestamp(static_cast(i + 1), 0); + WriteOptions write_opts; + Status s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts = Timestamp(1, 0); + WriteOptions write_opts; + ASSERT_OK(db_->Put(write_opts, "a", ts, "value")); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + Slice read_ts = ts; + read_opts.timestamp = &read_ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + timestamps.data(), statuses.data(), true); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(Timestamp(1, 0), timestamps[0]); + for (auto& elem : values) { + elem.Reset(); + } + + ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0))); + ts = Timestamp(3, 0); + read_ts = ts; + read_opts.timestamp = &read_ts; + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + timestamps.data(), statuses.data(), true); + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_EQ(Timestamp(2, 0), timestamps[0]); + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = 
true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + Slice read_ts = ts; + read_opts.timestamp = &read_ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + timestamps.data(), statuses.data(), true); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(Timestamp(1, 0), timestamps[0]); + for (auto& elem : values) { + elem.Reset(); + } + + ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0))); + // TODO re-enable after fixing a bug of kHashSearch + if (GetParam() != BlockBasedTableOptions::IndexType::kHashSearch) { + ASSERT_OK(Flush()); + } + + ts = Timestamp(3, 0); + read_ts = ts; + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + timestamps.data(), statuses.data(), true); + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_EQ(Timestamp(2, 0), timestamps[0]); + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar")); + + // Read with MultiGet + ts = Timestamp(2, 0); + Slice read_ts = ts; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  // random data
+  for (int i = 0; i < 3; i++) {
+    auto key = std::to_string(i * 10);
+    auto value = std::to_string(i * 10);
+    Slice key_slice = key;
+    Slice value_slice = value;
+    ASSERT_OK(db_->Put(write_opts, key_slice, ts, value_slice));
+    ASSERT_OK(Flush());
+  }
+
+  // Make num_levels to 2 to do key range filtering of sst files
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+
+  // Read with MultiGet
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                statuses.data());
+
+  ASSERT_OK(statuses[0]);
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = false;
+  bbto.index_type = GetParam();
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+  // Read with MultiGet
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<std::string> values(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+  std::vector<Status> statuses =
+      db_->MultiGet(read_opts, cfhs, keys, &values, &timestamps);
+
+  ASSERT_OK(statuses[0]);
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  constexpr size_t max_skippable_internal_keys = 2;
+  const size_t kNumKeys = max_skippable_internal_keys + 2;
+  WriteOptions write_opts;
+  Status s;
+  {
+    std::string ts = Timestamp(1, 0);
+    ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+  }
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    iter->Next();
+    ASSERT_TRUE(iter->status().IsIncomplete());
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) {
+  Options options = GetDefaultOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  constexpr size_t max_skippable_internal_keys = 2;
+  const size_t kNumKeys = max_skippable_internal_keys + 2;
+  WriteOptions write_opts;
+  Status s;
+  {
+    std::string ts = Timestamp(1, 0);
+    ASSERT_OK(db_->Put(write_opts, "b", ts, "value"));
+  }
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToLast();
+    iter->Prev();
+    ASSERT_TRUE(iter->status().IsIncomplete());
+  }
+  Close();
+}
+
+// Create two L0, and compact them to a new L1. In this test, L1 is L_bottom.
+// Two L0s:
+//       f1                                        f2
+// <a, 1, kTypeValue>   <a, 3, kTypeDeletionWithTimestamp>...<b, 2, kTypeValue>
+// Since f2.smallest < f1.largest < f2.largest
+// f1 and f2 will be the inputs of a real compaction instead of trivial move.
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.num_levels = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, "a", ts, "value0"));
+  ASSERT_OK(Flush());
+
+  ts = Timestamp(2, 0);
+  ASSERT_OK(db_->Put(write_opts, "b", ts, "value0"));
+  ts = Timestamp(3, 0);
+  ASSERT_OK(db_->Delete(write_opts, "a", ts));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReadOptions read_opts;
+  ts = Timestamp(1, 0);
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value0", value);
+
+  ts = Timestamp(3, 0);
+  read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  std::string key_ts;
+  s = db_->Get(read_opts, "a", &value, &key_ts);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(Timestamp(3, 0), key_ts);
+
+  // Time-travel to the past before deletion
+  ts = Timestamp(2, 0);
+  read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  s = db_->Get(read_opts, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value0", value);
+  Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampFilterPrefixSettings
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const FilterPolicy>, bool, bool,
+                     std::shared_ptr<const SliceTransform>, bool, double,
+                     BlockBasedTableOptions::IndexType>> {
+ public:
+  DBBasicTestWithTimestampFilterPrefixSettings()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_filter_prefix") {}
+};
+
+TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = std::get<1>(GetParam());
+  bbto.cache_index_and_filter_blocks = std::get<2>(GetParam());
+  bbto.index_type = std::get<6>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.prefix_extractor = std::get<3>(GetParam());
+  options.memtable_whole_key_filtering = std::get<4>(GetParam());
+  options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam());
+
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  const int kMaxKey = 1000;
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  int idx = 0;
+  for (; idx < kMaxKey / 4; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  for (; idx < kMaxKey / 2; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  ASSERT_OK(Flush());
+
+  for (; idx < kMaxKey; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  // Read with MultiGet
+  ReadOptions read_opts;
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+
+  for (idx = 0; idx < kMaxKey; idx++) {
+    size_t batch_size = 4;
+    std::vector<std::string> keys_str(batch_size);
+    std::vector<PinnableSlice> values(batch_size);
+    std::vector<Status> statuses(batch_size);
+    ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+
+    keys_str[0] = Key1(idx);
+    keys_str[1] = KeyWithPrefix("foo", idx);
+    keys_str[2] = Key1(kMaxKey + idx);
+    keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx);
+
+    auto keys = ConvertStrToSlice(keys_str);
+
+    db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                  statuses.data());
+
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(statuses[i]);
+    }
+    for (int i = 2; i < 4; i++) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+    }
+
+    for (int i = 0; i < 2; i++) {
+      std::string value;
+      ASSERT_OK(db_->Get(read_opts, keys[i], &value));
+      std::unique_ptr<Iterator> it1(db_->NewIterator(read_opts));
+      ASSERT_NE(nullptr, it1);
+      ASSERT_OK(it1->status());
+      it1->Seek(keys[i]);
+      ASSERT_TRUE(it1->Valid());
+    }
+
+    for (int i = 2; i < 4; i++) {
+      std::string value;
+      Status s = db_->Get(read_opts, keys[i], &value);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampFilterPrefixSettings,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const FilterPolicy>(nullptr),
+            std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
+                                                                     true)),
+            std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
+                                                                     false))),
+        ::testing::Bool(), ::testing::Bool(),
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Bool(), ::testing::Values(0, 0.1),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class DataVisibilityTest : public DBBasicTestWithTimestampBase {
+ public:
+  DataVisibilityTest()
+      : DBBasicTestWithTimestampBase("data_visibility_test") {
DBBasicTestWithTimestampBase("data_visibility_test") { + // Initialize test data + for (int i = 0; i < kTestDataSize; i++) { + test_data_[i].key = "key" + std::to_string(i); + test_data_[i].value = "value" + std::to_string(i); + test_data_[i].timestamp = Timestamp(i, 0); + test_data_[i].ts = i; + test_data_[i].seq_num = kMaxSequenceNumber; + } + } + + protected: + struct TestData { + std::string key; + std::string value; + int ts; + std::string timestamp; + SequenceNumber seq_num; + }; + + constexpr static int kTestDataSize = 3; + TestData test_data_[kTestDataSize]; + + void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) { + ASSERT_LE(index, kTestDataSize); + WriteOptions write_opts; + + if (cfh == nullptr) { + ASSERT_OK(db_->Put(write_opts, test_data_[index].key, + test_data_[index].timestamp, test_data_[index].value)); + const Snapshot* snap = db_->GetSnapshot(); + test_data_[index].seq_num = snap->GetSequenceNumber(); + if (index > 0) { + ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num); + } + db_->ReleaseSnapshot(snap); + } else { + ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key, + test_data_[index].timestamp, test_data_[index].value)); + } + } + + void AssertVisibility(int ts, SequenceNumber seq, + std::vector statuses) { + ASSERT_EQ(kTestDataSize, statuses.size()); + for (int i = 0; i < kTestDataSize; i++) { + if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + } + } + + std::vector GetKeys() { + std::vector ret(kTestDataSize); + for (int i = 0; i < kTestDataSize; i++) { + ret[i] = test_data_[i].key; + } + return ret; + } + + void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) { + ReadOptions read_opts; + std::string read_ts = Timestamp(ts, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + read_opts.snapshot = snap; + + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfs(kTestDataSize, cfh); + SequenceNumber seq = + snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1; + + // There're several MultiGet interfaces with not exactly the same + // implementations, query data with all of them. 
+    auto keys = GetKeys();
+    std::vector<std::string> values;
+    auto s1 = db_->MultiGet(read_opts, cfs, keys, &values);
+    AssertVisibility(ts, seq, s1);
+
+    auto s2 = db_->MultiGet(read_opts, keys, &values);
+    AssertVisibility(ts, seq, s2);
+
+    std::vector<std::string> timestamps;
+    auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, &timestamps);
+    AssertVisibility(ts, seq, s3);
+
+    auto s4 = db_->MultiGet(read_opts, keys, &values, &timestamps);
+    AssertVisibility(ts, seq, s4);
+
+    std::vector<PinnableSlice> values_ps5(kTestDataSize);
+    std::vector<Status> s5(kTestDataSize);
+    db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(),
+                  values_ps5.data(), s5.data());
+    AssertVisibility(ts, seq, s5);
+
+    std::vector<PinnableSlice> values_ps6(kTestDataSize);
+    std::vector<Status> s6(kTestDataSize);
+    std::vector<std::string> timestamps_array(kTestDataSize);
+    db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(),
+                  values_ps6.data(), timestamps_array.data(), s6.data());
+    AssertVisibility(ts, seq, s6);
+
+    std::vector<PinnableSlice> values_ps7(kTestDataSize);
+    std::vector<Status> s7(kTestDataSize);
+    db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+                  values_ps7.data(), s7.data());
+    AssertVisibility(ts, seq, s7);
+
+    std::vector<PinnableSlice> values_ps8(kTestDataSize);
+    std::vector<Status> s8(kTestDataSize);
+    db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+                  values_ps8.data(), timestamps_array.data(), s8.data());
+    AssertVisibility(ts, seq, s8);
+  }
+
+  void VerifyDefaultCF(const Snapshot* snap = nullptr) {
+    for (int i = 0; i <= kTestDataSize; i++) {
+      VerifyDefaultCF(i, snap);
+    }
+  }
+};
+constexpr int DataVisibilityTest::kTestDataSize;
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=90
+//           ts=100
+//           seq=10
+//                               seq'=11
+//                               write finishes
+//         GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot1) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::GetImpl:3",
+       "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut",
+       "DBImpl::GetImpl:4"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    ASSERT_OK(s);
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut");
+  });
+  ReadOptions read_opts;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+
+  writer_thread.join();
+  ASSERT_TRUE(s.IsNotFound());
+  Close();
+}
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=90
+//           ts=100
+//           seq=10
+//                               seq'=11
+//                               write finishes
+//                               Flush
+//         GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot2) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::GetImpl:3",
+       "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut",
+       "DBImpl::GetImpl:4"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    ASSERT_OK(s);
+    ASSERT_OK(Flush());
+
+    write_ts = Timestamp(2, 0);
+    s = db_->Put(write_opts, "bar", write_ts, "value");
+    ASSERT_OK(s);
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut");
+  });
+  ReadOptions read_opts;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  writer_thread.join();
+  ASSERT_TRUE(s.IsNotFound());
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//           reader               writer
+//           seq=10
+//                                ts'=90
+//           ts=100
+//                                seq'=11
+//                                write finishes
+//         GetImpl(ts,seq)
+// Since application specifies both timestamp and snapshot, application expects
+// to see data that is visible in BOTH timestamp and sequence number.
+// Therefore, <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap",
+       "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut",
+       "DBImpl::GetImpl:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut");
+    ASSERT_OK(s);
+  });
+  ReadOptions read_opts;
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap");
+  read_opts.snapshot = snap;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  writer_thread.join();
+
+  ASSERT_TRUE(s.IsNotFound());
+
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//           reader               writer
+//           seq=10
+//                                ts'=90
+//           ts=100
+//                                seq'=11
+//                                write finishes
+//                                Flush
+//         GetImpl(ts,seq)
+// Since application specifies both timestamp and snapshot, application expects
+// to see data that is visible in BOTH timestamp and sequence number.
+// Therefore, <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap",
+       "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value1");
+    ASSERT_OK(s);
+    ASSERT_OK(Flush());
+
+    write_ts = Timestamp(2, 0);
+    s = db_->Put(write_opts, "bar", write_ts, "value2");
+    ASSERT_OK(s);
+  });
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap");
+  writer_thread.join();
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.snapshot = snap;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=90
+//           ts=100
+//           seq=10
+//                               seq'=11
+//                               write finishes
+//         scan(ts,seq)
+// <k, t1, s1> can be seen in scan as long as ts>=t1 AND seq>=s1. If ts>=t1 but
+// seq<s1, the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithoutSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::NewIterator:3",
+       "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut");
+    for (int i = 0; i < 3; ++i) {
+      std::string write_ts = Timestamp(i + 1, 0);
+      Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  });
+  std::string read_ts_str = Timestamp(10, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  read_opts.timestamp = &read_ts;
+  Iterator* it = db_->NewIterator(read_opts);
+  ASSERT_NE(nullptr, it);
+  writer_thread.join();
+  it->SeekToFirst();
+  ASSERT_FALSE(it->Valid());
+  delete it;
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//           reader              writer
+//           seq=10
+//                               ts'=90
+//           ts=100
+//                               seq'=11
+//                               write finishes
+//         scan(ts,seq)
+// <k, t1, s1> can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts
+// but s1>seq, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot",
+       "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut");
+    for (int i = 0; i < 3; ++i) {
+      std::string write_ts = Timestamp(i + 1, 0);
+      Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  });
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot");
+
+  writer_thread.join();
+
+  std::string read_ts_str = Timestamp(10, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.snapshot = snap;
+  read_opts.total_order_seek = true;
+  read_opts.timestamp = &read_ts;
+  Iterator* it = db_->NewIterator(read_opts);
+  ASSERT_NE(nullptr, it);
+  it->Seek("key0");
+  ASSERT_FALSE(it->Valid());
+
+  delete it;
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// Query each combination and make sure for MultiGet key <k, t1, s1>, only
+// return keys that ts>=t1 AND seq>=s1.
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  const Snapshot* snap0 = db_->GetSnapshot();
+  PutTestData(0);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+
+  const Snapshot* snap1 = db_->GetSnapshot();
+  PutTestData(1);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+
+  ASSERT_OK(Flush());
+
+  const Snapshot* snap2 = db_->GetSnapshot();
+  PutTestData(2);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+  VerifyDefaultCF(snap2);
+
+  db_->ReleaseSnapshot(snap0);
+  db_->ReleaseSnapshot(snap1);
+  db_->ReleaseSnapshot(snap2);
+
+  Close();
+}
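+
+// The visibility scenarios exercised above all reduce to one predicate; a
+// minimal sketch of that rule (IsVisibleToRead is illustrative only, not a
+// helper in this file):
+//
+//   bool IsVisibleToRead(uint64_t t1, SequenceNumber s1,   // write's ts/seq
+//                        uint64_t ts, SequenceNumber seq)  // read's ts/seq
+//   { return t1 <= ts && s1 <= seq; }  // visible in BOTH dimensions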
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=0, 1
+//           ts=3
+//           seq=10
+//                               seq'=11, 12
+//                               write finishes
+//         MultiGet(ts,seq)
+// For MultiGet <k, t1, s1>, only return keys that ts>=t1 AND seq>=s1.
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::MultiGet:AfterGetSeqNum1",
+       "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"},
+      {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut",
+       "DBImpl::MultiGet:AfterGetSeqNum2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut");
+    PutTestData(0);
+    PutTestData(1);
+    TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut");
+  });
+
+  ReadOptions read_opts;
+  std::string read_ts = Timestamp(kTestDataSize, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  auto keys = GetKeys();
+  std::vector<std::string> values;
+  auto ss = db_->MultiGet(read_opts, keys, &values);
+
+  writer_thread.join();
+  for (auto s : ss) {
+    ASSERT_TRUE(s.IsNotFound());
+  }
+  VerifyDefaultCF();
+  Close();
+}
+
+TEST_F(DataVisibilityTest, MultiGetCrossCF) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  CreateAndReopenWithCF({"second"}, options);
+  ColumnFamilyHandle* second_cf = handles_[1];
+
+  const Snapshot* snap0 = db_->GetSnapshot();
+  PutTestData(0);
+  PutTestData(0, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+
+  const Snapshot* snap1 = db_->GetSnapshot();
+  PutTestData(1);
+  PutTestData(1, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+
+  ASSERT_OK(Flush());
+
+  const Snapshot* snap2 = db_->GetSnapshot();
+  PutTestData(2);
+  PutTestData(2, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+  VerifyDefaultCF(snap2);
+
+  ReadOptions read_opts;
+  std::string read_ts = Timestamp(kTestDataSize, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  read_opts.snapshot = snap1;
+  auto keys = GetKeys();
+  auto keys2 = GetKeys();
+  keys.insert(keys.end(), keys2.begin(), keys2.end());
+  std::vector<ColumnFamilyHandle*> cfs(kTestDataSize,
+                                       db_->DefaultColumnFamily());
+  std::vector<ColumnFamilyHandle*> cfs2(kTestDataSize, second_cf);
+  cfs.insert(cfs.end(), cfs2.begin(), cfs2.end());
+
+  std::vector<std::string> values;
+  auto ss = db_->MultiGet(read_opts, cfs, keys, &values);
+  for (int i = 0; i < 2 * kTestDataSize; i++) {
+    if (i % 3 == 0) {
+      // only the first key for each column family should be returned
+      ASSERT_OK(ss[i]);
+    } else {
+      ASSERT_TRUE(ss[i].IsNotFound());
+    }
+  }
+
+  db_->ReleaseSnapshot(snap0);
+  db_->ReleaseSnapshot(snap1);
+  db_->ReleaseSnapshot(snap2);
+  Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampCompressionSettings
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const FilterPolicy>, CompressionType,
+                     uint32_t, uint32_t>> {
+ public:
+  DBBasicTestWithTimestampCompressionSettings()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_compression") {}
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) {
+  const int kNumKeysPerFile = 1024;
+  const size_t kNumTimestamps = 4;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  options.target_file_size_base = 1 << 26;  // 64MB
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice write_ts = write_ts_list.back();
+    WriteOptions wopts;
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+        ASSERT_OK(
+            db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+                     "value_" + std::to_string(j) + "_" + std::to_string(i)));
+      }
+    }
+  }
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+          std::string value;
+          ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value));
+          ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+                    value);
+        }
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  const int kNumKeysPerFile = 1024;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  options.target_file_size_base = 1 << 26;  // 64MB
+
+  DestroyAndReopen(options);
+
+  const size_t kNumL0Files =
+      static_cast<size_t>(Options().level0_file_num_compaction_trigger);
+  {
+    // Half of the keys will go through Deletion and remaining half with
+    // SingleDeletion. Generate enough L0 files with ts=1 to trigger compaction
+    // to L1
+    std::string ts = Timestamp(1, 0);
+    WriteOptions wopts;
+    for (size_t i = 0; i < kNumL0Files; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(db_->Put(wopts, Key1(j), ts, "value" + std::to_string(i)));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Generate another L0 at ts=3
+    ts = Timestamp(3, 0);
+    for (int i = 0; i < kNumKeysPerFile; ++i) {
+      std::string key_str = Key1(i);
+      Slice key(key_str);
+      if ((i % 3) == 0) {
+        if (i < kNumKeysPerFile / 2) {
+          ASSERT_OK(db_->Delete(wopts, key, ts));
+        } else {
+          ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+        }
+      } else {
+        ASSERT_OK(db_->Put(wopts, key, ts, "new_value"));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    // Populate memtable at ts=5
+    ts = Timestamp(5, 0);
+    for (int i = 0; i != kNumKeysPerFile; ++i) {
+      std::string key_str = Key1(i);
+      Slice key(key_str);
+      if ((i % 3) == 1) {
+        if (i < kNumKeysPerFile / 2) {
+          ASSERT_OK(db_->Delete(wopts, key, ts));
+        } else {
+          ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+        }
+      } else if ((i % 3) == 2) {
+        ASSERT_OK(db_->Put(wopts, key, ts, "new_value_2"));
+      }
+    }
+  }
+  {
+    std::string ts_str = Timestamp(6, 0);
+    Slice ts = ts_str;
+    ReadOptions ropts;
+    ropts.timestamp = &ts;
+    for (uint64_t i = 0; i != static_cast<uint64_t>(kNumKeysPerFile); ++i) {
+      std::string value;
+      std::string key_ts;
+      Status s = db_->Get(ropts, Key1(i), &value, &key_ts);
+      if ((i % 3) == 2) {
+        ASSERT_OK(s);
+        ASSERT_EQ("new_value_2", value);
+        ASSERT_EQ(Timestamp(5, 0), key_ts);
+      } else if ((i % 3) == 1) {
+        ASSERT_TRUE(s.IsNotFound());
+        ASSERT_EQ(Timestamp(5, 0), key_ts);
+      } else {
+        ASSERT_TRUE(s.IsNotFound());
+        ASSERT_EQ(Timestamp(3, 0), key_ts);
+      }
+    }
+  }
+}
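+
+// Note on the assertions above: when the newest entry visible at the read
+// timestamp is a Delete or SingleDelete, Get() reports NotFound but still
+// fills in that tombstone's timestamp, which is why key_ts is asserted even
+// on the NotFound branches.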
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    InstrumentedMutexLock lock(&mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::vector<std::string> result;
+    {
+      InstrumentedMutexLock lock(&mutex_);
+      result = flushed_files_;
+    }
+    return result;
+  }
+
+  void ClearFlushedFiles() {
+    InstrumentedMutexLock lock(&mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  InstrumentedMutex mutex_;
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) {
+  const int kNumKeysPerFile = 1024;
+  const size_t kNumTimestamps = 2;
+  const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+  const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  const auto& verify_records_func = [&](size_t i, size_t begin, size_t end,
+                                        ColumnFamilyHandle* cfh) {
+    std::string value;
+    std::string timestamp;
+
+    ReadOptions ropts;
+    const Slice read_ts = read_ts_list[i];
+    ropts.timestamp = &read_ts;
+    std::string expected_timestamp =
+        std::string(write_ts_list[i].data(), write_ts_list[i].size());
+
+    for (size_t j = begin; j <= end; ++j) {
+      ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, &timestamp));
+      ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value);
+      ASSERT_EQ(expected_timestamp, timestamp);
+    }
+  };
+
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice write_ts = write_ts_list.back();
+    WriteOptions wopts;
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      size_t memtable_get_start = 0;
+      for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+        ASSERT_OK(
+            db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+                     "value_" + std::to_string(j) + "_" + std::to_string(i)));
+        if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+          verify_records_func(i, memtable_get_start, j, handles_[cf]);
+          memtable_get_start = j + 1;
+
+          // flush all keys with the same timestamp to two sst files, split at
+          // incremental positions such that lowerlevel[1].smallest.userkey ==
+          // higherlevel[0].largest.userkey
+          ASSERT_OK(Flush(cf));
+          ASSERT_OK(dbfull()->TEST_WaitForCompact());  // wait for flush (which
+                                                       // is also a compaction)
+
+          // compact files (2 at each level) to a lower level such that all
+          // keys with the same timestamp is at one level, with newer versions
+          // at higher levels.
+          CompactionOptions compact_opt;
+          compact_opt.compression = kNoCompression;
+          ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf],
+                                      collector->GetFlushedFiles(),
+                                      static_cast<int>(kNumTimestamps - i)));
+          collector->ClearFlushedFiles();
+        }
+      }
+    }
+  }
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      std::string expected_timestamp(write_ts_list[i].data(),
+                                     write_ts_list[i].size());
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh);
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) {
+  const int kNumKeysPerFile = 8192;
+  const size_t kNumTimestamps = 2;
+  const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.memtable_whole_key_filtering = true;
+
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(
+      10 /*bits_per_key*/, false /*use_block_based_builder*/));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) {
+    std::vector<Slice> keys;
+    std::vector<std::string> key_vals;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      key_vals.push_back(Key1(j));
+    }
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      keys.push_back(key_vals[j]);
+    }
+
+    ReadOptions ropts;
+    const Slice read_ts = read_ts_list[i];
+    ropts.timestamp = &read_ts;
+    std::string expected_timestamp(write_ts_list[i].data(),
+                                   write_ts_list[i].size());
+
+    std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+    std::vector<Status> statuses =
+        db_->MultiGet(ropts, cfhs, keys, &values, &timestamps);
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      ASSERT_OK(statuses[j]);
+      ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+                values[j]);
+      ASSERT_EQ(expected_timestamp, timestamps[j]);
+    }
+  };
+
+  const std::string dummy_ts(ts_sz, '\0');
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice& write_ts = write_ts_list.back();
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      WriteOptions wopts;
+      WriteBatch batch(0, 0, 0, ts_sz);
+      for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+        const std::string key = Key1(j);
+        const std::string value =
+            "value_" + std::to_string(j) + "_" + std::to_string(i);
+        ASSERT_OK(batch.Put(handles_[cf], key, value));
+      }
+      ASSERT_OK(batch.UpdateTimestamps(write_ts,
+                                       [ts_sz](uint32_t) { return ts_sz; }));
+      ASSERT_OK(db_->Write(wopts, &batch));
+
+      verify_records_func(i, handles_[cf]);
+
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        verify_records_func(i, cfh);
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "bar", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", ts, "value"));
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  {
+    ColumnFamilyHandle* column_families[] = {cfh, cfh};
+    Slice keys[] = {"foo", "bar"};
+    PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
+    Status statuses[] = {Status::OK(), Status::OK()};
+    dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0],
+                       &keys[0], &values[0], &statuses[0],
+                       /*sorted_input=*/false);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+  }
+  {
+    ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh};
+    // Make user keys longer than configured timestamp size (16 bytes) to
+    // verify RocksDB does not use the trailing bytes 'x' as timestamp.
+    Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo",
+                    "bar"};
+    PinnableSlice values[] = {PinnableSlice(), PinnableSlice(),
+                              PinnableSlice(), PinnableSlice()};
+    Status statuses[] = {Status::OK(), Status::OK(), Status::OK(),
+                         Status::OK()};
+    dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0],
+                       &keys[0], &values[0], &statuses[0],
+                       /*sorted_input=*/false);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+  }
+  Close();
+}
+
+#endif  // !ROCKSDB_LITE
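+
+// The long-key checks above follow from the key layout used with user-defined
+// timestamps: the timestamp is a fixed-size suffix appended to the user key
+// and is never inferred from the key bytes themselves. A minimal sketch of
+// the encoding (helper name illustrative, not an API in this file):
+//
+//   std::string AppendTs(const Slice& user_key, const Slice& ts) {
+//     std::string buf(user_key.data(), user_key.size());
+//     buf.append(ts.data(), ts.size());  // layout: |user_key|ts|
+//     return buf;
+//   }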
+ Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(), + PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK(), Status::OK(), + Status::OK()}; + dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + } + Close(); +} + +#endif // !ROCKSDB_LITE + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampCompressionSettings, + ::testing::Combine( + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10, false))), + ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression, + kLZ4HCCompression, kZSTD), + ::testing::Values(0, 1 << 14), ::testing::Values(1, 4))); + +class DBBasicTestWithTimestampPrefixSeek + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, + std::shared_ptr, bool, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampPrefixSeek() + : DBBasicTestWithTimestampBase( + "/db_basic_test_with_timestamp_prefix_seek") {} +}; + +TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { + const size_t kNumKeysPerFile = 128; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor = std::get<0>(GetParam()); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = 0xfffffffffffff000; + const std::vector write_ts_list = {Timestamp(3, 0xffffffff), + Timestamp(6, 0xffffffff)}; + WriteOptions write_opts; + { + for (size_t i = 0; i != write_ts_list.size(); ++i) { + for (uint64_t key = kMaxKey; key >= kMinKey; --key) { + Status s = db_->Put(write_opts, Key1(key), write_ts_list[i], + "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + } + const std::vector read_ts_list = {Timestamp(5, 0xffffffff), + Timestamp(9, 0xffffffff)}; + { + ReadOptions read_opts; + read_opts.total_order_seek = false; + read_opts.prefix_same_as_start = std::get<2>(GetParam()); + fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(), + bbto.filter_policy ? 
bbto.filter_policy->Name() : "null", + static_cast(read_opts.prefix_same_as_start)); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + + // Seek to kMaxKey + iter->Seek(Key1(kMaxKey)); + CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + // Seek to kMinKey + iter->Seek(Key1(kMinKey)); + CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + } + const std::vector targets = {kMinKey, kMinKey + 0x10, + kMinKey + 0x100, kMaxKey}; + const SliceTransform* const pe = options.prefix_extractor.get(); + ASSERT_NE(nullptr, pe); + const size_t kPrefixShift = + 8 * (Key1(0).size() - pe->Transform(Key1(0)).size()); + const uint64_t kPrefixMask = + ~((static_cast(1) << kPrefixShift) - 1); + const uint64_t kNumKeysWithinPrefix = + (static_cast(1) << kPrefixShift); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // Forward and backward iterate. + for (size_t j = 0; j != targets.size(); ++j) { + std::string start_key = Key1(targets[j]); + uint64_t expected_ub = + (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix; + uint64_t expected_key = targets[j]; + size_t count = 0; + it->Seek(Key1(targets[j])); + while (it->Valid()) { + std::string saved_prev_key; + saved_prev_key.assign(it->key().data(), it->key().size()); + + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(saved_prev_key) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + ++expected_key; + it->Next(); + } + ASSERT_EQ(expected_ub - targets[j] + 1, count); + + count = 0; + expected_key = targets[j]; + it->SeekForPrev(start_key); + uint64_t expected_lb = (targets[j] & kPrefixMask); + while (it->Valid()) { + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(it->key()) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + --expected_key; + it->Prev(); + } + ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); + } + } + } + Close(); +} + +// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g. +// NoopTransform. 
+
+// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g.
+// NoopTransform.
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampPrefixSeek,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(10 /*bits_per_key*/,
+                                                   false)),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(20 /*bits_per_key*/,
+                                                   false))),
+        ::testing::Bool(),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+
+class DBBasicTestWithTsIterTombstones
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const SliceTransform>,
+                     std::shared_ptr<const FilterPolicy>, int,
+                     BlockBasedTableOptions::IndexType>> {
+ public:
+  DBBasicTestWithTsIterTombstones()
+      : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {}
+};
+
+TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) {
+  constexpr size_t kNumKeysPerFile = 128;
+  Options options = CurrentOptions();
+  options.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.prefix_extractor = std::get<0>(GetParam());
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<1>(GetParam());
+  bbto.index_type = std::get<3>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.num_levels = std::get<2>(GetParam());
+  DestroyAndReopen(options);
+  std::vector<std::string> write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)};
+  constexpr uint64_t kMaxKey = 0xffffffffffffffff;
+  constexpr uint64_t kMinKey = 0xfffffffffffff000;
+  // Insert kMinKey...kMaxKey
+  uint64_t key = kMinKey;
+  WriteOptions write_opts;
+  Slice ts = write_ts_strs[0];
+  do {
+    Status s = db_->Put(write_opts, Key1(key), write_ts_strs[0],
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+    if (kMaxKey == key) {
+      break;
+    }
+    ++key;
+  } while (true);
+
+  for (key = kMaxKey; key >= kMinKey; --key) {
+    Status s;
+    if (0 != (key % 2)) {
+      s = db_->Put(write_opts, Key1(key), write_ts_strs[1],
+                   "value1" + std::to_string(key));
+    } else {
+      s = db_->Delete(write_opts, Key1(key), write_ts_strs[1]);
+    }
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  {
+    std::string read_ts = Timestamp(4, 0);
+    ts = read_ts;
+    ReadOptions read_opts;
+    read_opts.total_order_seek = true;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    size_t count = 0;
+    key = kMinKey + 1;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) {
+      ASSERT_EQ(Key1(key), iter->key());
+      ASSERT_EQ("value1" + std::to_string(key), iter->value());
+    }
+    ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+
+    for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid();
+         key -= 2, ++count, iter->Prev()) {
+      ASSERT_EQ(Key1(key), iter->key());
+      ASSERT_EQ("value1" + std::to_string(key), iter->value());
+    }
+    ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTsIterTombstones,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(10, false)),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(20, false))),
+        ::testing::Values(2, 6),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class UpdateFullHistoryTsLowTest : public DBBasicTestWithTimestampBase {
+ public:
+  UpdateFullHistoryTsLowTest()
+      : DBBasicTestWithTimestampBase("/update_full_history_ts_low_test") {}
+};
+
+TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  std::string lower_ts_low = Timestamp(10, 0);
+  std::string higher_ts_low = Timestamp(25, 0);
+  const size_t kTimestampSize = lower_ts_low.size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  // This workaround swaps `lower_ts_low` originally used for update by the
+  // caller to `higher_ts_low` after its writer is queued to make sure
+  // the caller will always get a TryAgain error.
+  // It mimics cases where two threads update full_history_ts_low concurrently
+  // with one thread writing a higher ts_low and one thread writing a lower
+  // ts_low.
+  VersionEdit* version_edit;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+      [&](void* arg) { version_edit = reinterpret_cast<VersionEdit*>(arg); });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:BeforeWriterWaiting",
+      [&](void* /*arg*/) {
+        version_edit->SetFullHistoryTsLow(higher_ts_low);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_TRUE(
+      db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), lower_ts_low)
+          .IsTryAgain());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
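+
+// full_history_ts_low only moves forward, so a caller racing with another
+// updater can observe TryAgain as above. A minimal sketch of how a caller
+// might react (assumes the DB::GetFullHistoryTsLow getter and a
+// timestamp-aware comparator; illustrative only):
+//
+//   std::string current;
+//   ASSERT_OK(db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), &current));
+//   if (test_cmp.CompareTimestamp(target_ts_low, current) > 0) {
+//     Status s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+//                                              target_ts_low);
+//   }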
+
+TEST_F(DBBasicTestWithTimestamp,
+       GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  std::string ts_str = Timestamp(1, 0);
+  WriteOptions wopts;
+  ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+  ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+  ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+  ts_str = Timestamp(2, 0);
+  ASSERT_OK(
+      db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
+
+  ts_str = Timestamp(3, 0);
+  Slice ts = ts_str;
+  ReadOptions ropts;
+  ropts.timestamp = &ts;
+  CompactRangeOptions cro;
+  cro.full_history_ts_low = nullptr;
+  std::string value, key_ts;
+  Status s;
+  auto verify = [&] {
+    s = db_->Get(ropts, "k1", &value);
+    ASSERT_TRUE(s.IsNotFound());
+
+    s = db_->Get(ropts, "k2", &value, &key_ts);
+    ASSERT_TRUE(s.IsNotFound());
+    ASSERT_EQ(key_ts, Timestamp(2, 0));
+
+    ASSERT_OK(db_->Get(ropts, "k3", &value, &key_ts));
+    ASSERT_EQ(value, "v3");
+    ASSERT_EQ(Timestamp(1, 0), key_ts);
+
+    size_t batch_size = 3;
+    std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+    std::vector<Slice> keys{key_strs.begin(), key_strs.end()};
+    std::vector<PinnableSlice> values(batch_size);
+    std::vector<Status> statuses(batch_size);
+    db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                  values.data(), statuses.data(), true /* sorted_input */);
+    ASSERT_TRUE(statuses[0].IsNotFound());
+    ASSERT_TRUE(statuses[1].IsNotFound());
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], "v3");
+  };
+  verify();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  verify();
+  std::string lb = Timestamp(0, 0);
+  Slice lb_slice = lb;
+  cro.full_history_ts_low = &lb_slice;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  verify();
+  Close();
+}
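+
+// The GC behavior above hinges on when a timestamped range tombstone
+// [b, e)@tts hides a version k@vts from a read at read_ts; a sketch of that
+// rule, ignoring sequence numbers (names illustrative, not a function in this
+// file):
+//
+//   bool Covers(uint64_t tts, uint64_t vts, uint64_t read_ts) {
+//     // the key must also satisfy b <= k < e
+//     return tts <= read_ts && vts < tts;
+//   }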
+  std::string compaction_ts_str = Timestamp(2, 0);
+  Slice compaction_ts = compaction_ts_str;
+  cro.full_history_ts_low = &compaction_ts;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ropts.timestamp = &compaction_ts;
+  std::string value, ts;
+  ASSERT_OK(db_->Get(ropts, "k1", &value, &ts));
+  ASSERT_EQ(value, "v1");
+  // The timestamp is below full_history_ts_low and is zeroed out as the key
+  // goes into the bottommost level.
+  ASSERT_EQ(ts, Timestamp(0, 0));
+  ASSERT_TRUE(db_->Get(ropts, "k2", &value, &ts).IsNotFound());
+  ASSERT_EQ(ts, Timestamp(2, 0));
+
+  compaction_ts_str = Timestamp(4, 0);
+  compaction_ts = compaction_ts_str;
+  cro.full_history_ts_low = &compaction_ts;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ropts.timestamp = &read_ts_slice;
+  // k1, k2 and the range tombstone should be dropped;
+  // k3 should still exist.
+  db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                values.data(), timestamps.data(), statuses.data(),
+                true /* sorted_input */);
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  ASSERT_TRUE(timestamps[0].empty());
+  ASSERT_TRUE(statuses[1].IsNotFound());
+  ASSERT_TRUE(timestamps[1].empty());
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[2], "v3");
+  ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBasicReadAndIterate) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.compression = kNoCompression;
+  BlockBasedTableOptions bbto;
+  bbto.index_type = GetParam();
+  bbto.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  DestroyAndReopen(options);
+
+  // Write half of the keys before the tombstone and half after the tombstone.
+  // Only covered keys (i.e., within the range and older than the tombstone)
+  // should be deleted.
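+  // Concretely: the tombstone is written at i == kNum / 2, i.e. over
+  // [Key1(50), Key1(150)) at Timestamp(100, 0). Keys 50..99 carry smaller
+  // timestamps and are covered; keys 100..149 are written at or above the
+  // tombstone's timestamp and survive, as do all keys outside the range.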
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key1(kRangeBegin), Key1(kRangeEnd),
+                                 Timestamp(i, 0)));
+    }
+    ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+                       "val" + std::to_string(i)));
+    if (i == kNum - kNumPerFile) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  std::string read_ts = Timestamp(kNum, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_OK(iter->status());
+
+    int expected = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      if (expected == kRangeBegin - 1) {
+        expected = kNum / 2;
+      } else {
+        ++expected;
+      }
+    }
+    ASSERT_EQ(kNum, expected);
+
+    expected = kNum / 2;
+    for (iter->Seek(Key1(kNum / 2)); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      ++expected;
+    }
+    ASSERT_EQ(kNum, expected);
+
+    expected = kRangeBegin - 1;
+    for (iter->SeekForPrev(Key1(kNum / 2 - 1)); iter->Valid(); iter->Prev()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      --expected;
+    }
+    ASSERT_EQ(-1, expected);
+
+    read_ts = Timestamp(0, 0);
+    read_ts_slice = read_ts;
+    read_opts.timestamp = &read_ts_slice;
+    iter.reset(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key1(0));
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  read_ts = Timestamp(kNum, 0);
+  read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  std::string value, timestamp;
+  Status s;
+  for (int i = 0; i < kNum; ++i) {
+    s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+    if (i >= kRangeBegin && i < kNum / 2) {
+      ASSERT_TRUE(s.IsNotFound());
+      ASSERT_EQ(timestamp, Timestamp(kNum / 2, 0));
+    } else {
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "val" + std::to_string(i));
+      ASSERT_EQ(timestamp, Timestamp(i, 0));
+    }
+  }
+
+  size_t batch_size = kNum;
+  std::vector<std::string> key_strs(batch_size);
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  for (int i = 0; i < kNum; ++i) {
+    key_strs[i] = Key1(i);
+    keys[i] = key_strs[i];
+  }
+  db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                values.data(), timestamps.data(), statuses.data(),
+                true /* sorted_input */);
+  for (int i = 0; i < kNum; ++i) {
+    if (i >= kRangeBegin && i < kNum / 2) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+      ASSERT_EQ(timestamps[i], Timestamp(kNum / 2, 0));
+    } else {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], "val" + std::to_string(i));
+      ASSERT_EQ(timestamps[i], Timestamp(i, 0));
+    }
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) {
+  // 4 keys 0, 1, 2, 3 at timestamps 0, 1, 2, 3 respectively.
+  // A range tombstone over [1, 3) at timestamp 1, with a sequence number
+  // between key 1 and key 2.
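+  // An entry is visible only if its sequence number is covered by the read
+  // snapshot AND its timestamp is <= the read timestamp, so the four
+  // (snapshot, timestamp) combinations below each see a different view of
+  // key 1.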
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string put_ts = Timestamp(0, 0);
+  const int kNum = 4, kNumPerFile = 1, kRangeBegin = 1, kRangeEnd = 3;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  const Snapshot* before_tombstone = nullptr;
+  const Snapshot* after_tombstone = nullptr;
+  for (int i = 0; i < kNum; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+                       "val" + std::to_string(i)));
+    if (i == kRangeBegin) {
+      before_tombstone = db_->GetSnapshot();
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key1(kRangeBegin), Key1(kRangeEnd),
+                                 Timestamp(kRangeBegin, 0)));
+    }
+    if (i == kNum / 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  assert(before_tombstone);
+  after_tombstone = db_->GetSnapshot();
+  // snapshot and ts before tombstone
+  std::string read_ts_str = Timestamp(kRangeBegin - 1, 0);  // (0, 0)
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = before_tombstone;
+  std::vector<Status> expected_status = {
+      Status::OK(), Status::NotFound(), Status::NotFound(), Status::NotFound()};
+  std::vector<std::string> expected_values(kNum);
+  expected_values[0] = "val" + std::to_string(0);
+  std::vector<std::string> expected_timestamps(kNum);
+  expected_timestamps[0] = Timestamp(0, 0);
+
+  size_t batch_size = kNum;
+  std::vector<std::string> key_strs(batch_size);
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  for (int i = 0; i < kNum; ++i) {
+    key_strs[i] = Key1(i);
+    keys[i] = key_strs[i];
+  }
+
+  auto verify = [&] {
+    db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size,
+                  keys.data(), values.data(), timestamps.data(),
+                  statuses.data(), true /* sorted_input */);
+    std::string value, timestamp;
+    Status s;
+    for (int i = 0; i < kNum; ++i) {
+      s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+      ASSERT_EQ(s, expected_status[i]);
+      ASSERT_EQ(statuses[i], expected_status[i]);
+      if (s.ok()) {
+        ASSERT_EQ(value, expected_values[i]);
+        ASSERT_EQ(values[i], expected_values[i]);
+      }
+      if (!timestamp.empty()) {
+        ASSERT_EQ(timestamp, expected_timestamps[i]);
+        ASSERT_EQ(timestamps[i], expected_timestamps[i]);
+      } else {
+        ASSERT_TRUE(timestamps[i].empty());
+      }
+    }
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    std::unique_ptr<Iterator> iter_for_seek(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    for (int i = 0; i < kNum; ++i) {
+      if (expected_status[i].ok()) {
+        auto verify_iter = [&](Iterator* iter_ptr) {
+          ASSERT_TRUE(iter_ptr->Valid());
+          ASSERT_EQ(iter_ptr->key(), keys[i]);
+          ASSERT_EQ(iter_ptr->value(), expected_values[i]);
+          ASSERT_EQ(iter_ptr->timestamp(), expected_timestamps[i]);
+        };
+        verify_iter(iter.get());
+        iter->Next();
+
+        iter_for_seek->Seek(keys[i]);
+        verify_iter(iter_for_seek.get());
+
+        iter_for_seek->SeekForPrev(keys[i]);
+        verify_iter(iter_for_seek.get());
+      }
+    }
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  };
+
+  verify();
+
+  // snapshot before tombstone and ts after tombstone
+  read_ts_str = Timestamp(kNum, 0);  // (4, 0)
+  read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = before_tombstone;
+  expected_status[1] = Status::OK();
+  expected_timestamps[1] = Timestamp(1, 0);
+  expected_values[1] = "val" + std::to_string(1);
+  verify();
+
+  // snapshot
after tombstone and ts before tombstone + read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0) + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + read_opts.snapshot = after_tombstone; + expected_status[1] = Status::NotFound(); + expected_timestamps[1].clear(); + expected_values[1].clear(); + verify(); + + // snapshot and ts after tombstone + read_ts_str = Timestamp(kNum, 0); // (4, 0) + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + read_opts.snapshot = after_tombstone; + for (int i = 0; i < kNum; ++i) { + if (i == kRangeBegin) { + expected_status[i] = Status::NotFound(); + expected_values[i].clear(); + } else { + expected_status[i] = Status::OK(); + expected_values[i] = "val" + std::to_string(i); + } + expected_timestamps[i] = Timestamp(i, 0); + } + verify(); + + db_->ReleaseSnapshot(before_tombstone); + db_->ReleaseSnapshot(after_tombstone); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MergeBasic) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.merge_operator = std::make_shared('.'); + DestroyAndReopen(options); + + const std::array write_ts_strs = { + Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)}; + constexpr size_t kNumOfUniqKeys = 100; + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + + for (size_t i = 0; i < write_ts_strs.size(); ++i) { + for (size_t j = 0; j < kNumOfUniqKeys; ++j) { + Status s; + if (i == 0) { + const std::string val = "v" + std::to_string(j) + "_0"; + s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val); + } else { + const std::string merge_op = std::to_string(i); + s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i], + merge_op); + } + ASSERT_OK(s); + } + } + + std::array read_ts_strs = { + Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)}; + + const auto verify_db_with_get = [&]() { + for (size_t i = 0; i < kNumOfUniqKeys; ++i) { + const std::string base_val = "v" + std::to_string(i) + "_0"; + const std::array expected_values = { + base_val, base_val + ".1", base_val + ".1.2"}; + const std::array& expected_ts = write_ts_strs; + ReadOptions read_opts; + for (size_t j = 0; j < read_ts_strs.size(); ++j) { + Slice read_ts = read_ts_strs[j]; + read_opts.timestamp = &read_ts; + std::string value; + std::string ts; + const Status s = db_->Get(read_opts, Key1(i), &value, &ts); + ASSERT_OK(s); + ASSERT_EQ(expected_values[j], value); + ASSERT_EQ(expected_ts[j], ts); + + // Do Seek/SeekForPrev + std::unique_ptr it(db_->NewIterator(read_opts)); + it->Seek(Key1(i)); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(expected_values[j], it->value()); + ASSERT_EQ(expected_ts[j], it->timestamp()); + + it->SeekForPrev(Key1(i)); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(expected_values[j], it->value()); + ASSERT_EQ(expected_ts[j], it->timestamp()); + } + } + }; + + const auto verify_db_with_iterator = [&]() { + std::string value_suffix; + for (size_t i = 0; i < read_ts_strs.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_ts_strs[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + size_t key_int_val = 0; + for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) { + const std::string key = Key1(key_int_val); + const std::string value = + "v" + std::to_string(key_int_val) + "_0" + value_suffix; + ASSERT_EQ(key, it->key()); + ASSERT_EQ(value, it->value()); + 
ASSERT_EQ(write_ts_strs[i], it->timestamp()); + } + ASSERT_EQ(kNumOfUniqKeys, key_int_val); + + key_int_val = kNumOfUniqKeys - 1; + for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) { + const std::string key = Key1(key_int_val); + const std::string value = + "v" + std::to_string(key_int_val) + "_0" + value_suffix; + ASSERT_EQ(key, it->key()); + ASSERT_EQ(value, it->value()); + ASSERT_EQ(write_ts_strs[i], it->timestamp()); + } + ASSERT_EQ(std::numeric_limits::max(), key_int_val); + + value_suffix = value_suffix + "." + std::to_string(i + 1); + } + }; + + verify_db_with_get(); + verify_db_with_iterator(); + + ASSERT_OK(db_->Flush(FlushOptions())); + + verify_db_with_get(); + verify_db_with_iterator(); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.merge_operator = std::make_shared('.'); + DestroyAndReopen(options); + + ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily(); + + const size_t num_keys_per_file = 10; + const size_t num_merges_per_key = 2; + for (size_t i = 0; i < num_keys_per_file; ++i) { + std::string ts = Timestamp(i + 10000, 0); + Status s = db_->Delete(WriteOptions(), Key1(i), ts); + ASSERT_OK(s); + for (size_t j = 1; j <= num_merges_per_key; ++j) { + ts = Timestamp(i + 10000 + j, 0); + s = db_->Merge(WriteOptions(), column_family, Key1(i), ts, + std::to_string(j)); + ASSERT_OK(s); + } + } + + const auto verify_db = [&]() { + ReadOptions read_opts; + std::string read_ts_str = Timestamp(20000, 0); + Slice ts = read_ts_str; + read_opts.timestamp = &ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + size_t count = 0; + for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) { + std::string key = Key1(count); + ASSERT_EQ(key, it->key()); + std::string value; + for (size_t j = 1; j <= num_merges_per_key; ++j) { + value.append(std::to_string(j)); + if (j < num_merges_per_key) { + value.push_back('.'); + } + } + ASSERT_EQ(value, it->value()); + std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0); + ASSERT_EQ(ts1, it->timestamp()); + } + ASSERT_OK(it->status()); + ASSERT_EQ(num_keys_per_file, count); + for (it->SeekToLast(); it->Valid(); it->Prev(), --count) { + std::string key = Key1(count - 1); + ASSERT_EQ(key, it->key()); + std::string value; + for (size_t j = 1; j <= num_merges_per_key; ++j) { + value.append(std::to_string(j)); + if (j < num_merges_per_key) { + value.push_back('.'); + } + } + ASSERT_EQ(value, it->value()); + std::string ts1 = Timestamp(count - 1 + 10000 + num_merges_per_key, 0); + ASSERT_EQ(ts1, it->timestamp()); + } + ASSERT_OK(it->status()); + ASSERT_EQ(0, count); + }; + + verify_db(); + + Close(); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_with_timestamp_compaction_test.cc b/src/rocksdb/db/db_with_timestamp_compaction_test.cc new file mode 100644 index 000000000..d28f67e05 --- /dev/null +++ b/src/rocksdb/db/db_with_timestamp_compaction_test.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +std::string Key1(uint64_t key) { + std::string ret; + PutFixed64(&ret, key); + std::reverse(ret.begin(), ret.end()); + return ret; +} + +std::string Timestamp(uint64_t ts) { + std::string ret; + PutFixed64(&ret, ts); + return ret; +} +} // anonymous namespace + +class TimestampCompatibleCompactionTest : public DBTestBase { + public: + TimestampCompatibleCompactionTest() + : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {} + + std::string Get(const std::string& key, uint64_t ts) { + ReadOptions read_opts; + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + read_opts.timestamp = &ts_slice; + std::string value; + Status s = db_->Get(read_opts, key, &value); + if (s.IsNotFound()) { + value.assign("NOT_FOUND"); + } else if (!s.ok()) { + value.assign(s.ToString()); + } + return value; + } +}; + +TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleLevel; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.level0_file_num_compaction_trigger = 3; + constexpr size_t kNumKeysPerFile = 101; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + const auto* compaction = reinterpret_cast(arg); + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Check that all 3 L0 ssts are picked for level compaction. + ASSERT_EQ(3, compaction->num_input_files(0)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Write a L0 with keys 0, 1, ..., 99 with ts from 100 to 199. + uint64_t ts = 100; + uint64_t key = 0; + WriteOptions write_opts; + for (; key < kNumKeysPerFile - 1; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + ASSERT_OK( + db_->Put(write_opts, Key1(key), ts_str, "foo_" + std::to_string(key))); + } + // Write another L0 with keys 99 with newer ts. 
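+  // (The flush below seals the first file; saved_read_ts1 is taken between
+  // the two files, so a read at that timestamp sees "foo_99" but not
+  // "bar_99".)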
+ ASSERT_OK(Flush()); + uint64_t saved_read_ts1 = ts++; + key = 99; + for (int i = 0; i < 4; ++i, ++ts) { + std::string ts_str = Timestamp(ts); + ASSERT_OK( + db_->Put(write_opts, Key1(key), ts_str, "bar_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + uint64_t saved_read_ts2 = ts++; + // Write another L0 with keys 99, 100, 101, ..., 150 + for (; key <= 150; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + ASSERT_OK( + db_->Put(write_opts, Key1(key), ts_str, "foo1_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + uint64_t read_ts = ts; + ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1)); + ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2)); + ASSERT_EQ("foo1_99", Get(Key1(99), read_ts)); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleUniversal; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.level0_file_num_compaction_trigger = 3; + options.max_subcompactions = 3; + options.target_file_size_base = 1024; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + uint64_t ts = 100; + uint64_t key = 0; + WriteOptions write_opts; + + // Write keys 0, 1, ..., 499 with ts from 100 to 599. + { + for (; key <= 499; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str, + "foo_" + std::to_string(key))); + } + } + + // Write keys 500, ..., 999 with ts from 600 to 1099. + { + for (; key <= 999; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str, + "foo_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + } + + // Wait for compaction to finish + { + ASSERT_OK(dbfull()->RunManualCompaction( + static_cast_with_check( + db_->DefaultColumnFamily()) + ->cfd(), + 0 /* input_level */, 1 /* output_level */, CompactRangeOptions(), + nullptr /* begin */, nullptr /* end */, true /* exclusive */, + true /* disallow_trivial_move */, + std::numeric_limits::max() /* max_file_num_to_ignore */, + "" /*trim_ts*/)); + } + + // Check stats to make sure multiple subcompactions were scheduled for + // boundaries not to be nullptr. 
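+  // (NUM_SUBCOMPACTIONS_SCHEDULED is a histogram, so a sum greater than 1
+  // here indicates that more than one subcompaction was scheduled for the
+  // manual compaction rather than it running as a single unit.)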
+ { + HistogramData num_sub_compactions; + options.statistics->histogramData(NUM_SUBCOMPACTIONS_SCHEDULED, + &num_sub_compactions); + ASSERT_GT(num_sub_compactions.sum, 1); + } + + for (key = 0; key <= 999; ++key) { + ASSERT_EQ("foo_" + std::to_string(key), Get(Key1(key), ts)); + } +} + +class TestFilePartitioner : public SstPartitioner { + public: + explicit TestFilePartitioner() {} + ~TestFilePartitioner() override {} + + const char* Name() const override { return "TestFilePartitioner"; } + PartitionerResult ShouldPartition( + const PartitionerRequest& /*request*/) override { + return PartitionerResult::kRequired; + } + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } +}; + +class TestFilePartitionerFactory : public SstPartitionerFactory { + public: + explicit TestFilePartitionerFactory() {} + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /*context*/) const override { + std::unique_ptr ret = + std::make_unique(); + return ret; + } + const char* Name() const override { return "TestFilePartitionerFactory"; } +}; + +#ifndef ROCKSDB_LITE +TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL0) { + Options options = CurrentOptions(); + options.env = env_; + options.sst_partitioner_factory = + std::make_shared(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + constexpr int kNumFiles = 10; + constexpr int kKeysPerFile = 2; + const std::string user_key = "foo"; + constexpr uint64_t start_ts = 10000; + + uint64_t cur_ts = start_ts; + for (int k = 0; k < kNumFiles; ++k) { + for (int i = 0; i < kKeysPerFile; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), + "v" + std::to_string(i))); + ++cur_ts; + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + + std::vector input_files{}; + { + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& f : files) { + uint64_t file_num = 0; + FileType file_type = FileType::kWalFile; + if (!ParseFileName(f, &file_num, &file_type) || + file_type != FileType::kTableFile) { + continue; + } + input_files.emplace_back(f); + } + // sorting here by name, which also happens to sort by generation date. + std::sort(input_files.begin(), input_files.end()); + assert(kNumFiles == input_files.size()); + std::vector tmp; + tmp.emplace_back(input_files[input_files.size() / 2]); + input_files.swap(tmp); + } + + { + std::vector output_file_names; + CompactionJobInfo compaction_job_info; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), input_files, + /*output_level=*/1, /*output_path_id=*/-1, + &output_file_names, &compaction_job_info)); + // We expect the L0 files older than the original provided input were all + // included in the compaction. + ASSERT_EQ(static_cast(kNumFiles / 2 + 1), + compaction_job_info.input_files.size()); + } +} + +TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL1) { + Options options = CurrentOptions(); + options.env = env_; + options.sst_partitioner_factory = + std::make_shared(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + + constexpr int kNumFiles = 4; + options.level0_file_num_compaction_trigger = kNumFiles; + + DestroyAndReopen(options); + + constexpr int kKeysPerFile = 2; + const std::string user_key = "foo"; + constexpr uint64_t start_ts = 10000; + + uint64_t cur_ts = start_ts; + // Generate some initial files in both L0 and L1. 
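+  // (TestFilePartitioner requests a split at every key, so compacting these
+  // flushed files into L1 leaves one table file per entry; the
+  // NumTableFilesAtLevel() checks below depend on this.)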
+ for (int k = 0; k < kNumFiles; ++k) { + for (int i = 0; i < kKeysPerFile; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), + "v" + std::to_string(i))); + ++cur_ts; + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0)); + ASSERT_EQ(kNumFiles * kKeysPerFile, + NumTableFilesAtLevel(/*level=*/1, /*cf=*/0)); + + constexpr int additional_l0s = 2; + for (int i = 0; i < additional_l0s; ++i, ++cur_ts) { + ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), "v")); + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_EQ(additional_l0s, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0)); + + std::vector inputs; + { + std::vector fmetas; + db_->GetLiveFilesMetaData(&fmetas); + bool included_one_l1 = false; + for (const auto& meta : fmetas) { + if (meta.level == 0) { + inputs.emplace_back(meta.relative_filename); + } else if (!included_one_l1) { + inputs.emplace_back(meta.relative_filename); + included_one_l1 = true; + } + } + } + ASSERT_EQ(static_cast(3), inputs.size()); + { + std::vector output_file_names; + CompactionJobInfo compaction_job_info; + + ASSERT_OK(db_->CompactFiles(CompactionOptions(), inputs, /*output_level=*/1, + /*output_path_id=*/-1, &output_file_names, + &compaction_job_info)); + ASSERT_EQ(kNumFiles * kKeysPerFile + 2, output_file_names.size()); + ASSERT_EQ(kNumFiles * kKeysPerFile + 2, + static_cast(compaction_job_info.input_files.size())); + } +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_with_timestamp_test_util.cc b/src/rocksdb/db/db_with_timestamp_test_util.cc new file mode 100644 index 000000000..f562bcb48 --- /dev/null +++ b/src/rocksdb/db/db_with_timestamp_test_util.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
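+// A note on the encodings defined below: Key1() writes the integer with
+// PutFixed64 (little-endian) and then reverses the bytes, yielding a
+// big-endian key so that bytewise ordering matches numeric ordering.
+// Timestamp(low, high) is PutFixed64(low) followed by PutFixed64(high),
+// 16 bytes in total; e.g. Timestamp(3, 0) is the little-endian bytes of 3
+// followed by eight zero bytes. TestComparator orders timestamps by
+// (high, low) and negates the result, so newer timestamps sort first
+// within a user key.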
+ +#include "db/db_with_timestamp_test_util.h" + +namespace ROCKSDB_NAMESPACE { +std::string DBBasicTestWithTimestampBase::Key1(uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return ret; +} + +std::string DBBasicTestWithTimestampBase::KeyWithPrefix(std::string prefix, + uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return prefix + ret; +} + +std::vector DBBasicTestWithTimestampBase::ConvertStrToSlice( + std::vector& strings) { + std::vector ret; + for (const auto& s : strings) { + ret.emplace_back(s); + } + return ret; +} + +std::string DBBasicTestWithTimestampBase::Timestamp(uint64_t low, + uint64_t high) { + std::string ts; + PutFixed64(&ts, low); + PutFixed64(&ts, high); + return ts; +} + +void DBBasicTestWithTimestampBase::CheckIterUserEntry( + const Iterator* it, const Slice& expected_key, + ValueType expected_value_type, const Slice& expected_value, + const Slice& expected_ts) const { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ(expected_key, it->key()); + if (kTypeValue == expected_value_type) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); +} + +void DBBasicTestWithTimestampBase::CheckIterEntry( + const Iterator* it, const Slice& expected_ukey, SequenceNumber expected_seq, + ValueType expected_val_type, const Slice& expected_value, + const Slice& expected_ts) const { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + ParsedInternalKey parsed_ikey; + ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(expected_seq, parsed_ikey.sequence); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); +} + +void DBBasicTestWithTimestampBase::CheckIterEntry( + const Iterator* it, const Slice& expected_ukey, ValueType expected_val_type, + const Slice& expected_value, const Slice& expected_ts) const { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_with_timestamp_test_util.h b/src/rocksdb/db/db_with_timestamp_test_util.h new file mode 100644 index 000000000..8a0d8e4e3 --- /dev/null +++ b/src/rocksdb/db/db_with_timestamp_test_util.h @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +class DBBasicTestWithTimestampBase : public DBTestBase { + public: + explicit DBBasicTestWithTimestampBase(const std::string& dbname) + : DBTestBase(dbname, /*env_do_fsync=*/true) {} + + protected: + static std::string Key1(uint64_t k); + + static std::string KeyWithPrefix(std::string prefix, uint64_t k); + + static std::vector ConvertStrToSlice( + std::vector& strings); + + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return -CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + if (a_has_ts) { + assert(a.size() >= timestamp_size()); + } + if (b_has_ts) { + assert(b.size() >= timestamp_size()); + } + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a; + Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + const size_t kSize = ts1.size(); + std::unique_ptr ts1_buf(new char[kSize]); + memcpy(ts1_buf.get(), ts1.data(), ts1.size()); + std::unique_ptr ts2_buf(new char[kSize]); + memcpy(ts2_buf.get(), ts2.data(), ts2.size()); + Slice ts1_copy = Slice(ts1_buf.get(), kSize); + Slice ts2_copy = Slice(ts2_buf.get(), kSize); + auto* ptr1 = const_cast(&ts1_copy); + auto* ptr2 = const_cast(&ts2_copy); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return -1; + } else if (high1 > high2) { + return 1; + } + if (low1 < low2) { + return -1; + } else if (low1 > low2) { + return 1; + } + return 0; + } + }; + + std::string Timestamp(uint64_t low, uint64_t high); + + void CheckIterUserEntry(const Iterator* it, const Slice& expected_key, + ValueType expected_value_type, + const Slice& expected_value, + const Slice& expected_ts) const; + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + SequenceNumber expected_seq, ValueType expected_val_type, + const Slice& expected_value, + const Slice& expected_ts) const; + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + ValueType expected_val_type, const Slice& expected_value, + const Slice& expected_ts) const; +}; +} // namespace ROCKSDB_NAMESPACE diff --git 
a/src/rocksdb/db/db_write_buffer_manager_test.cc b/src/rocksdb/db/db_write_buffer_manager_test.cc
new file mode 100644
index 000000000..4c31a7824
--- /dev/null
+++ b/src/rocksdb/db/db_write_buffer_manager_test.cc
@@ -0,0 +1,862 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/write_thread.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWriteBufferManagerTest : public DBTestBase,
+                                 public testing::WithParamInterface<bool> {
+ public:
+  DBWriteBufferManagerTest()
+      : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
+  bool cost_cache_;
+};
+
+TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+
+  // Write to "Default", "cf2" and "cf3".
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+  // WriteBufferManager::buffer_size_ has been exceeded after the previous
+  // write completed.
+
+  // This makes sure the write will go through and that, if a stall was in
+  // effect, it will end.
+  ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
+}
+
+// Test that a single DB with multiple writer threads gets blocked when
+// WriteBufferManager's buffer_size_ is exceeded and a flush is waiting to
+// be finished.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+
+  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
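+  // (The sizes below are chosen so that the 100000-byte WriteBufferManager
+  // budget is crossed only by the final put: 30000 + 40000 + 1 + 40000
+  // exceeds 100000.)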
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. + if (w_set.size() == (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // and they will be blocked. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(writer, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers; i++) { + threads.emplace_back(writer, i % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_writers); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush +// is waiting to be finished but DBs tries to write meanwhile. 
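+// Each DB hitting the shared limit parks in WBMStallInterface::BlockDB; the
+// sync-point dependency keeps the background flush from starting until the
+// last blocked DB signals it, at which point the stall can clear.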
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager Limit exceeded. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Since this is the last DB, signal Flush to continue. + if (wait_count_db == num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + // Write to DB. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // db_ will write and will be blocked (as Flush will on hold and will create + // stall in effect). + // | + // multiple dbs writers will be created to write to that db and they will be + // blocked. + // | + // | + // Last writer will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(write_db, db_); + // Wait untill first DB is blocked and then create the multiple writers for + // different DBs which will be blocked from getting added to the queue because + // stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_dbs; i++) { + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple DBs and multiple columns get +// blocked when stall by WriteBufferManager is in effect. 
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_set; + std::vector writer_threads; + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + thread_num.fetch_add(1); + cv.Signal(); + // Allow the flush to continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + { + InstrumentedMutexLock lock(&mutex); + w_set.insert(w); + thread_num.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + // Write to multiple columns of db_. + std::function write_cf = [&](int cf) { + Status tmp = Put(cf, Key(3), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + // Write to multiple DBs. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s2 = s2 && tmp.ok(); + }; + + // Flow: + // thread will write to db_ will be blocked (as Flush will on hold, + // buffer_size_ has exceeded and will create stall in effect). + // | + // | + // multiple writers threads writing to different DBs and to db_ across + // multiple columns will be created and they will be blocked due to stall. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. 
+ threads.emplace_back(write_db, db_); + // Wait untill first thread is blocked and then create the multiple writer + // threads. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i++) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_cf, i % 3); + // Write to different dbs. + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + + // Number of DBs blocked. + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_dbs); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ by passing +// different values to WriteOption.no_slown_down. +TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // db_ is completed. + + std::unordered_set w_slowdown_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( + std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + + std::function write_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // with different values of WriteOptions.no_slowdown. Some of them will + // be blocked and some of them will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_writers; i += 2) { + threads.emplace_back(write_no_slow_down, (i) % 4); + threads.emplace_back(write_slow_down, (i + 1) % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_slowdown_set.size(), num_writers / 2); + // Number of Writer threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ and different +// dbs by passing different values to WriteOption.no_slown_down. 
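+// Writers issued with no_slowdown = true are expected to fail fast with a
+// non-OK (Incomplete) status instead of blocking; the test counts both
+// populations so that the last arrival, blocked or failed, releases the
+// pending flush.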
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_slowdown_set; + std::vector writer_threads; + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + InstrumentedMutexLock lock(&mutex); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + std::function write_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // first thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // of db_ and different DBs with different values of + // WriteOptions.no_slowdown. Some of them will be blocked and some of them + // will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, db_); + // Wait untill first thread writing to DB is blocked and then + // create the multiple writers. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i += 2) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_slow_down, db_); + writer_threads.emplace_back(write_no_slow_down, db_); + // Write to different DBs. + threads.emplace_back(write_slow_down, dbs[i]); + threads.emplace_back(write_no_slow_down, dbs[i + 1]); + } + + for (auto& t : threads) { + t.join(); + } + + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ((num_dbs / 2) + 1, wait_count_db); + // Number of writer threads writing to db_ blocked from getting added to the + // queue. + ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2); + // Number of threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs); + + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE + +// Tests a `WriteBufferManager` constructed with `allow_stall == false` does not +// thrash memtable switching when full and a CF receives multiple writes. +// Instead, we expect to switch a CF's memtable for flush only when that CF does +// not have any pending or running flush. 
+
+#ifndef ROCKSDB_LITE
+
+// Tests that a `WriteBufferManager` constructed with `allow_stall == false`
+// does not thrash memtable switching when full and a CF receives multiple
+// writes. Instead, we expect to switch a CF's memtable for flush only when
+// that CF does not have any pending or running flush.
+//
+// This test uses multiple DBs each with a single CF instead of a single DB
+// with multiple CFs. That way we can control which CF is considered for
+// switch by writing to that CF's DB.
+//
+// Not supported in LITE mode due to `GetProperty()` unavailable.
+TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4 << 10;   // 4KB
+  options.write_buffer_size = 1 << 20;  // 1MB
+  std::shared_ptr<Cache> cache =
+      NewLRUCache(4 << 20 /* capacity (4MB) */, 2 /* num_shard_bits */);
+  ASSERT_LT(cache->GetUsage(), 256 << 10 /* 256KB */);
+  cost_cache_ = GetParam();
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(new WriteBufferManager(
+        512 << 10 /* buffer_size (512KB) */, cache, false /* allow_stall */));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(512 << 10 /* buffer_size (512KB) */,
+                               nullptr /* cache */, false /* allow_stall */));
+  }
+
+  Reopen(options);
+  std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
+  DB* shared_wbm_db = nullptr;
+
+  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
+
+  // The last write will make WBM need flush, but it won't flush yet.
+  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+  ASSERT_FALSE(options.write_buffer_manager->ShouldFlush());
+  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+  ASSERT_TRUE(options.write_buffer_manager->ShouldFlush());
+
+  // Flushes will be pending, not running, because flush threads are blocked.
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(
+        shared_wbm_db->Put(WriteOptions(), Key(1), DummyString(1 /* len */)));
+    std::string prop;
+    ASSERT_TRUE(
+        shared_wbm_db->GetProperty("rocksdb.num-immutable-mem-table", &prop));
+    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+    ASSERT_TRUE(
+        shared_wbm_db->GetProperty("rocksdb.mem-table-flush-pending", &prop));
+    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+  }
+
+  // Clean up DBs.
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  ASSERT_OK(shared_wbm_db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+  delete shared_wbm_db;
+}
+
+#endif  // ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
+                        testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..1011d5c9e
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,679 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <fstream>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase,
+                    public testing::WithParamInterface<int> {
+ public:
+  DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {}
+
+  Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+  void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+class DBWriteTestUnparameterized : public DBTestBase {
+ public:
+  explicit DBWriteTestUnparameterized()
+      : DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {}
+};
+
+// It is invalid to do a sync write while disabling WAL.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = true;
+  ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "bar"));
+  ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
+  Options options = GetOptions();
+  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+      4;
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  // Guarded by mutex
+  int writers = 0;
+
+  Reopen(options);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    Status s = dbfull()->Put(wo, key, "bar");
+    ASSERT_TRUE(s.ok() || s.IsIncomplete());
+  };
+  std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+    mutex.Lock();
+    ++writers;
+    cv.SignalAll();
+    mutex.Unlock();
+  };
+
+  // Create 3 L0 files and schedule 4th without waiting
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2",
+        "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"},
+       // Make compaction start wait for the write stall to be detected and
+       // implemented by a write group leader
+       {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
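+  // For context on the two lambdas defined above: a write issued with
+  // WriteOptions::no_slowdown = true is expected to fail fast with
+  // Status::Incomplete() while a stall is in effect, instead of queueing
+  // behind stalled writers. A minimal sketch of the calling pattern (key and
+  // value are hypothetical):
+  //
+  //   WriteOptions wo;
+  //   wo.no_slowdown = true;
+  //   Status s = db->Put(wo, "key", "value");
+  //   assert(s.ok() || s.IsIncomplete());
+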
+  // Schedule creation of 4th L0 file without waiting. This will seal the
+  // memtable and then wait for a sync point before writing the file. We need
+  // to do it this way because SwitchMemtable() needs to enter the
+  // write_thread.
+  FlushOptions fopt;
+  fopt.wait = false;
+  ASSERT_OK(dbfull()->Flush(fopt));
+
+  // Create a mix of slowdown/no_slowdown write threads
+  mutex.Lock();
+  // First leader
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 1) {
+    cv.Wait();
+  }
+
+  // Second leader. Will stall writes
+  // Build a writers list with no slowdown in the middle:
+  //  +-------------+
+  //  |  slowdown   +<----+ newest
+  //  +--+----------+
+  //     |
+  //     v
+  //  +--+----------+
+  //  | no slowdown |
+  //  +--+----------+
+  //     |
+  //     v
+  //  +--+----------+
+  //  |  slowdown   +
+  //  +-------------+
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 2) {
+    cv.Wait();
+  }
+  threads.emplace_back(write_no_slowdown_func);
+  while (writers != 3) {
+    cv.Wait();
+  }
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 4) {
+    cv.Wait();
+  }
+
+  mutex.Unlock();
+
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1");
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+  // This would have triggered a write stall. Unblock the write group leader
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2");
+  // The leader is going to create missing newer links. When the leader
+  // finishes, the next leader is going to delay writes and fail writers with
+  // no_slowdown
+
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3");
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+  Options options = GetOptions();
+  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+      4;
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  // Guarded by mutex
+  int writers = 0;
+
+  Reopen(options);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    Status s = dbfull()->Put(wo, key, "bar");
+    ASSERT_TRUE(s.ok() || s.IsIncomplete());
+  };
+  std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+    mutex.Lock();
+    ++writers;
+    cv.SignalAll();
+    mutex.Unlock();
+  };
+
+  // Create 3 L0 files and schedule 4th without waiting
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+        "DBImpl::WriteImpl:BeforeLeaderEnters"},
+       // Make compaction start wait for the write stall to be detected and
+       // implemented by a write group leader
{"DBWriteTest::WriteThreadHangOnWriteStall:3", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Schedule creation of 4th L0 file without waiting. This will seal the + // memtable and then wait for a sync point before writing the file. We need + // to do it this way because SwitchMemtable() needs to enter the + // write_thread + FlushOptions fopt; + fopt.wait = false; + ASSERT_OK(dbfull()->Flush(fopt)); + + // Create a mix of slowdown/no_slowdown write threads + mutex.Lock(); + // First leader + threads.emplace_back(write_slowdown_func); + while (writers != 1) { + cv.Wait(); + } + // Second leader. Will stall writes + threads.emplace_back(write_slowdown_func); + threads.emplace_back(write_no_slowdown_func); + threads.emplace_back(write_slowdown_func); + threads.emplace_back(write_no_slowdown_func); + threads.emplace_back(write_slowdown_func); + while (writers != 6) { + cv.Wait(); + } + mutex.Unlock(); + + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); + // This would have triggered a write stall. Unblock the write group leader + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); + // The leader is going to create missing newer links. When the leader + // finishes, the next leader is going to delay writes and fail writers with + // no_slowdown + + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3"); + for (auto& t : threads) { + t.join(); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { + constexpr int kNumThreads = 5; + std::unique_ptr mock_env( + new FaultInjectionTestEnv(env_)); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + std::atomic ready_count{0}; + std::atomic leader_count{0}; + std::vector threads; + mock_env->SetFilesystemActive(false); + + // Wait until all threads linked to write threads, to make sure + // all threads join the same batch group. + SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + ready_count++; + auto* w = reinterpret_cast(arg); + if (w->state == WriteThread::STATE_GROUP_LEADER) { + leader_count++; + while (ready_count < kNumThreads) { + // busy waiting + } + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < kNumThreads; i++) { + threads.push_back(port::Thread( + [&](int index) { + // All threads should fail. + auto res = Put("key" + std::to_string(index), "value"); + if (options.manual_wal_flush) { + ASSERT_TRUE(res.ok()); + // we should see fs error when we do the flush + + // TSAN reports a false alarm for lock-order-inversion but Open and + // FlushWAL are not run concurrently. Disabling this until TSAN is + // fixed. + // res = dbfull()->FlushWAL(false); + // ASSERT_FALSE(res.ok()); + } else { + ASSERT_FALSE(res.ok()); + } + }, + i)); + } + for (int i = 0; i < kNumThreads; i++) { + threads[i].join(); + } + ASSERT_EQ(1, leader_count); + + // The Failed PUT operations can cause a BG error to be set. + // Mark it as Checked for the ASSERT_STATUS_CHECKED + dbfull()->Resume().PermitUncheckedError(); + + // Close before mock_env destruct. + Close(); +} + +TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) { + // This test was written to trigger a race in ExitAsBatchGroupLeader in case + // enable_pipelined_write_ was true. 
+  // Writers for which ShouldWriteToMemtable() evaluates to false are removed
+  // from the write_group via CompleteFollower/CompleteLeader. Writers in the
+  // middle of the group are fully unlinked, but if that writer is the
+  // last_writer, then we did not update the predecessor's link_older, i.e.,
+  // this writer was still reachable via newest_writer_.
+  //
+  // But the problem was that CompleteFollower already wakes up the thread
+  // owning that writer before the writer has been removed. This resulted in a
+  // race - if the leader thread was fast enough, then everything was fine.
+  // However, if the woken-up thread finished the current write operation and
+  // then performed yet another write, then a new writer instance was added
+  // to newest_writer_. It is possible that the new writer is located at the
+  // same address on the stack, and if this happened, then we had a problem,
+  // because the old code tried to find the last_writer in the list to unlink
+  // it, which in this case produced a cycle in the list.
+  // Whether two invocations of PipelinedWriteImpl() by the same thread
+  // actually allocate the writer at the same address depends on the OS and/or
+  // compiler, so it is rather hard to create a deterministic test for this.
+
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_pipelined_write = true;
+  std::vector<port::Thread> threads;
+
+  std::atomic<int> write_counter{0};
+  std::atomic<int> active_writers{0};
+  std::atomic<bool> second_write_starting{false};
+  std::atomic<bool> second_write_in_progress{false};
+  std::atomic<WriteThread::Writer*> leader{nullptr};
+  std::atomic<bool> finished_WAL_write{false};
+
+  DestroyAndReopen(options);
+
+  auto write_one_doc = [&]() {
+    int a = write_counter.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+    --active_writers;
+  };
+
+  auto write_two_docs = [&]() {
+    write_one_doc();
+    second_write_starting = true;
+    write_one_doc();
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+        if (second_write_starting.load()) {
+          second_write_in_progress = true;
+          return;
+        }
+        auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        if (w->state == WriteThread::STATE_GROUP_LEADER) {
+          active_writers++;
+          if (leader.load() == nullptr) {
+            leader.store(w);
+            while (active_writers.load() < 2) {
+              // wait for another thread to join the write_group
+            }
+          }
+        } else {
+          // we disable the memtable for all followers so that they are
+          // removed from the write_group before enqueuing it for the memtable
+          // write
+          w->disable_memtable = true;
+          active_writers++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) {
+        auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+        if (wg->leader == leader && !finished_WAL_write) {
+          finished_WAL_write = true;
+          while (active_writers.load() < 3) {
+            // wait for the new writer to be enqueued
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+      [&](void* arg) {
+        auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+        if (wg->leader == leader) {
+          while (!second_write_in_progress.load()) {
+            // wait for the old follower thread to start the next write
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // start leader + one follower
+  threads.emplace_back(write_one_doc);
+  while (leader.load() == nullptr) {
+    // wait for leader
+  }
+
+  // We perform two writes in the follower, so that for the second write
+  // the thread reinserts a Writer with the same address.
+  threads.emplace_back(write_two_docs);
+
+  // wait for the leader to enter ExitAsBatchGroupLeader
+  while (!finished_WAL_write.load()) {
+    // wait for write_group to have finished the WAL writes
+  }
+
+  // start another writer thread to be enqueued before the leader can
+  // complete the writers from its write_group
+  threads.emplace_back(write_one_doc);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+  // try the 2nd wal created during SwitchWAL
+  ASSERT_OK(dbfull()->TEST_SwitchWAL());
+  ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
+  // Repro race condition bug where unflushed WAL data extended the synced
+  // size recorded to MANIFEST despite being unrecoverable.
+  Options options = GetOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  options.manual_wal_flush = true;
+  options.track_and_verify_wals_in_manifest = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:Begin",
+      [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  // Ensure callback ran.
+  ASSERT_EQ("val2", Get("key2"));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+  // DB WAL.
+  fault_env->DropUnsyncedFileData();
+
+  Reopen(options);
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
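+// The crash-recovery pattern shared by the previous test and the next one:
+// write, optionally FlushWAL(true /* sync */), Close(), then
+// FaultInjectionTestEnv::DropUnsyncedFileData() to simulate losing everything
+// that was never synced, and finally Reopen() to check which keys survived.
+// A minimal sketch (keys hypothetical):
+//
+//   ASSERT_OK(Put("k", "v"));
+//   ASSERT_OK(db_->FlushWAL(true /* sync */));  // buffered WAL -> file, fsync
+//   Close();
+//   fault_env->DropUnsyncedFileData();  // "k" survives only if it was synced
+//   Reopen(options);
+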
+TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) {
+  // Repro bug where a WAL is appended and switched after
+  // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully
+  // synced inactive logs. Previously such a WAL would be wrongly untracked,
+  // so the final append would never be synced.
+  Options options = GetOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) {
+        ASSERT_OK(Put("key2", "val2"));
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_OK(Put("key3", "val3"));
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This should drop nothing since we
+  // did `FlushWAL(true /* sync */)` before `Close()`.
+  fault_env->DropUnsyncedFileData();
+
+  Reopen(options);
+
+  ASSERT_EQ("val1", Get("key1"));
+  ASSERT_EQ("val2", Get("key2"));
+  ASSERT_EQ("val3", Get("key3"));
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  Reopen(options);
+  for (int i = 0; i < 2; i++) {
+    // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+    // fail due to read-only mode
+    mock_env->SetFilesystemActive(i != 0);
+    auto res = Put("key" + std::to_string(i), "value");
+    // TSAN reports a false alarm for lock-order-inversion but Open and
+    // FlushWAL are not run concurrently. Disabling this until TSAN is
+    // fixed.
+    /*
+    if (options.manual_wal_flush && i == 0) {
+      // even with manual_wal_flush the 2nd Put should return error because of
+      // the read-only mode
+      ASSERT_TRUE(res.ok());
+      // we should see fs error when we do the flush
+      res = dbfull()->FlushWAL(false);
+    }
+    */
+    if (!options.manual_wal_flush) {
+      ASSERT_NOK(res);
+    } else {
+      ASSERT_OK(res);
+    }
+  }
+  // Close before mock_env destruct.
+  Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+  Random rnd(301);
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+  options.write_buffer_size = 3 * 512 * 1024;
+  options.wal_bytes_per_sync = 256 * 1024;
+  options.manual_wal_flush = true;
+  Reopen(options);
+  mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+  Status s;
+  for (int i = 0; i < 4 * 512; ++i) {
+    s = Put(Key(i), rnd.RandomString(1024));
+    if (!s.ok()) {
+      break;
+    }
+  }
+  ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+  mock_env->SetFilesystemActive(true);
+  // Close before mock_env destruct.
+  Close();
+}
+
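+// For reference: background write failures surface through Status severity
+// rather than only through the plain error code; the assertion above relies
+// on the Severity enum from rocksdb/status.h. A sketch of the checking
+// pattern (key/value hypothetical):
+//
+//   Status s = db->Put(WriteOptions(), "k", "v");
+//   if (!s.ok() && s.severity() >= Status::Severity::kFatalError) {
+//     // The DB rejects further writes until the error is resolved.
+//   }
+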
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_OK(Put("key" + std::to_string(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+  // try the 2nd wal created during SwitchWAL
+  ASSERT_OK(dbfull()->TEST_SwitchWAL());
+  ASSERT_OK(Put("key" + std::to_string(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+  Options options = GetOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  Reopen(options);
+  std::string wal_key_prefix = "WAL_KEY_";
+  std::string no_wal_key_prefix = "K_";
+  // 100 KB value each for NO-WAL operation
+  std::string no_wal_value(1024 * 100, 'X');
+  // 1B value each for WAL operation
+  std::string wal_value = "0";
+  std::thread threads[10];
+  for (int t = 0; t < 10; t++) {
+    threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix,
+                              no_wal_value, this] {
+      for (int i = 0; i < 10; i++) {
+        ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+        write_option_disable.disableWAL = true;
+        ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+        std::string no_wal_key =
+            no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i);
+        ASSERT_OK(this->Put(no_wal_key, no_wal_value, write_option_disable));
+        std::string wal_key =
+            wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+        ASSERT_OK(this->Put(wal_key, wal_value, write_option_default));
+        ASSERT_OK(dbfull()->SyncWAL());
+      }
+      return;
+    });
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  uint64_t bytes_num = options.statistics->getTickerCount(
+      ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+  // The written WAL size should be less than 100KB (even including HEADER &
+  // FOOTER overhead).
+  ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+                        testing::Values(DBTestBase::kDefault,
+                                        DBTestBase::kConcurrentWALWrites,
+                                        DBTestBase::kPipelinedWrite));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..b0ac6c339
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h" + +#include + +#include + +#include "db/lookup_key.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +const ValueType kValueTypeForSeek = kTypeWideColumnEntity; +const ValueType kValueTypeForSeekForPrev = kTypeDeletion; +const std::string kDisableUserTimestamp(""); + +EntryType GetEntryType(ValueType value_type) { + switch (value_type) { + case kTypeValue: + return kEntryPut; + case kTypeDeletion: + return kEntryDelete; + case kTypeDeletionWithTimestamp: + return kEntryDeleteWithTimestamp; + case kTypeSingleDeletion: + return kEntrySingleDelete; + case kTypeMerge: + return kEntryMerge; + case kTypeRangeDeletion: + return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; + case kTypeWideColumnEntity: + return kEntryWideColumnEntity; + default: + return kEntryOther; + } +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts) { + assert(key.user_key.size() >= ts.size()); + result->append(key.user_key.data(), key.user_key.size() - ts.size()); + result->append(ts.data(), ts.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t) { + PutFixed64(result, PackSequenceAndType(s, t)); +} + +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + result->append(key.data(), key.size() - ts_sz); + + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_sz < strlen(kTsMax)) { + result->append(kTsMax, ts_sz); + } else { + result->append(std::string(ts_sz, '\xff')); + } +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + + char buf[50]; + snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, + static_cast(type)); + + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + 
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(a.user_key, b.user_key);
+  if (r == 0) {
+    if (a.sequence > b.sequence) {
+      r = -1;
+    } else if (a.sequence < b.sequence) {
+      r = +1;
+    } else if (a.type > b.type) {
+      r = -1;
+    } else if (a.type < b.type) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+                     const Slice* ts) {
+  size_t usize = _user_key.size();
+  size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+  size_t needed = usize + ts_sz + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  // NOTE: We don't support user keys of more than 2GB :)
+  dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+  kstart_ = dst;
+  memcpy(dst, _user_key.data(), usize);
+  dst += usize;
+  if (nullptr != ts) {
+    memcpy(dst, ts->data(), ts_sz);
+    dst += ts_sz;
+  }
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+  // If size is smaller than buffer size, continue using current buffer,
+  // or the static allocated one, as default
+  assert(key_size > buf_size_);
+  // Need to enlarge the buffer.
+  ResetBuffer();
+  buf_ = new char[key_size];
+  buf_size_ = key_size;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..8c1fc7055
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,865 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType), and they are usually encoded together.
+// There are some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  kTypeLogData = 0x3,               // WAL only.
+  kTypeColumnFamilyDeletion = 0x4,  // WAL only.
+  kTypeColumnFamilyValue = 0x5,     // WAL only.
+  kTypeColumnFamilyMerge = 0x6,     // WAL only.
+  kTypeSingleDeletion = 0x7,
+  kTypeColumnFamilySingleDeletion = 0x8,  // WAL only.
+  kTypeBeginPrepareXID = 0x9,            // WAL only.
+  kTypeEndPrepareXID = 0xA,              // WAL only.
+  kTypeCommitXID = 0xB,                  // WAL only.
+  kTypeRollbackXID = 0xC,                // WAL only.
+  kTypeNoop = 0xD,                       // WAL only.
+  kTypeColumnFamilyRangeDeletion = 0xE,  // WAL only.
+  kTypeRangeDeletion = 0xF,              // meta block
+  kTypeColumnFamilyBlobIndex = 0x10,     // Blob DB only
+  kTypeBlobIndex = 0x11,                 // Blob DB only
+  // When the prepared record is also persisted in db, we use a different
+  // record. This is to ensure that the WAL that is generated by a WritePolicy
+  // is not mistakenly read by another, which would result in data
+  // inconsistency.
+  kTypeBeginPersistedPrepareXID = 0x12,  // WAL only.
+  // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+  // generated by the WriteUnprepared write policy is not mistakenly read by
+  // another.
+  kTypeBeginUnprepareXID = 0x13,  // WAL only.
+  kTypeDeletionWithTimestamp = 0x14,
+  kTypeCommitXIDAndTimestamp = 0x15,  // WAL only
+  kTypeWideColumnEntity = 0x16,
+  kTypeColumnFamilyWideColumnEntity = 0x17,  // WAL only
+  kTypeMaxValid,    // Should be after the last valid type, only used for
+                    // validation
+  kMaxValue = 0x7F  // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+  return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
+         kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t;
+}
+
+// Checks whether a type is from a user operation.
+// kTypeRangeDeletion is in the meta block, so this API is separated from the
+// one above.
+inline bool IsExtendedValueType(ValueType t) {
+  return IsValueType(t) || t == kTypeRangeDeletion;
+}
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber =
+    std::numeric_limits<uint64_t>::max();
+
+constexpr uint64_t kNumInternalBytes = 8;
+
+// Defined in dbformat.cc
+extern const std::string kDisableUserTimestamp;
+
+// The data structure that represents an internal key in the way that user_key,
+// sequence number and type are stored in separated forms.
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey()
+      : sequence(kMaxSequenceNumber),
+        type(kTypeDeletion)  // Make code analyzer happy
+  {}  // Intentionally left uninitialized (for speed)
+  // u contains timestamp if user timestamp feature is enabled.
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) {}
+  std::string DebugString(bool log_err_key, bool hex) const;
+
+  void clear() {
+    user_key.clear();
+    sequence = 0;
+    type = kTypeDeletion;
+  }
+
+  void SetTimestamp(const Slice& ts) {
+    assert(ts.size() <= user_key.size());
+    const char* addr = user_key.data() + user_key.size() - ts.size();
+    memcpy(const_cast<char*>(addr), ts.data(), ts.size());
+  }
+
+  Slice GetTimestamp(size_t ts_sz) {
+    assert(ts_sz <= user_key.size());
+    const char* addr = user_key.data() + user_key.size() - ts_sz;
+    return Slice(const_cast<char*>(addr), ts_sz);
+  }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + kNumInternalBytes;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
+  assert(IsExtendedValueType(t) || t == kTypeMaxValid);
+  return (seq << 8) | t;
+}
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
+                                  ValueType* t) {
+  *seq = packed >> 8;
+  *t = static_cast<ValueType>(packed & 0xff);
+
+  // Commented the following two assertions in order to test key-value checksum
+  // on corrupted keys without crashing ("DbKvChecksumTest").
+  // assert(*seq <= kMaxSequenceNumber);
+  // assert(IsExtendedValueType(*t));
+}
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Append the serialization of "key" to *result, replacing the original
+// timestamp with argument ts.
+extern void AppendInternalKeyWithDifferentTimestamp(
+    std::string* result, const ParsedInternalKey& key, const Slice& ts);
+
+// A serialized internal key consists of the user key followed by the footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+                                    ValueType t);
+
+// Append the key and a minimal timestamp to *result
+extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+                                      size_t ts_sz);
+
+// Append the key and a maximal timestamp to *result
+extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+                                      size_t ts_sz);
+
+// `key` is a user key with timestamp. Append the user key without timestamp
+// and the maximal timestamp to *result.
+extern void AppendUserKeyWithMaxTimestamp(std::string* result,
+                                          const Slice& key, size_t ts_sz);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result" and returns OK.
+//
+// On error, returns a non-OK Status, leaving "*result" in an undefined state.
+extern Status ParseInternalKey(const Slice& internal_key,
+                               ParsedInternalKey* result, bool log_err_key);
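+
+// A minimal round-trip sketch using the helpers declared above (values are
+// illustrative only):
+//
+//   std::string buf;
+//   AppendInternalKey(&buf, ParsedInternalKey("user_key", 42, kTypeValue));
+//   ParsedInternalKey parsed;
+//   Status s = ParseInternalKey(buf, &parsed, false /* log_err_key */);
+//   // s.ok() && parsed.user_key == "user_key" && parsed.sequence == 42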
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= kNumInternalBytes);
+  return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
+}
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  Slice ret = internal_key;
+  ret.remove_suffix(kNumInternalBytes + ts_sz);
+  return ret;
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  Slice ret = user_key;
+  ret.remove_suffix(ts_sz);
+  return ret;
+}
+
+inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
+}
+
+inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
+  const size_t key_size = internal_key.size();
+  assert(key_size >= kNumInternalBytes + ts_sz);
+  return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
+               ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+  assert(internal_key.size() >= kNumInternalBytes);
+  const size_t n = internal_key.size();
+  return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+    final
+#endif
+    : public CompareInterface {
+ private:
+  UserComparatorWrapper user_comparator_;
+
+ public:
+  // `InternalKeyComparator`s constructed with the default constructor are not
+  // usable and will segfault on any attempt to use them for comparisons.
+  InternalKeyComparator() = default;
+
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
+  virtual ~InternalKeyComparator() {}
+
+  int Compare(const Slice& a, const Slice& b) const override;
+
+  bool Equal(const Slice& a, const Slice& b) const {
+    // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
+    // comparing the user key too.
+    return Compare(a, b) == 0;
+  }
+
+  // Same as Compare except that it excludes the value type from comparison
+  int CompareKeySeq(const Slice& a, const Slice& b) const;
+
+  const Comparator* user_comparator() const {
+    return user_comparator_.user_comparator();
+  }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  // In this `Compare()` overload, the sequence numbers provided in
+  // `a_global_seqno` and `b_global_seqno` override the sequence numbers in
+  // `a` and `b`, respectively. To disable sequence number override(s),
+  // provide the value `kDisableGlobalSequenceNumber`.
+  int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
+              SequenceNumber b_global_seqno) const;
+};
+
+// This class represents the internal key in encoded form.
+class InternalKey {
+ private:
+  std::string rep_;
+
+ public:
+  InternalKey() {}  // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+  }
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t,
+              Slice ts) {
+    AppendInternalKeyWithDifferentTimestamp(
+        &rep_, ParsedInternalKey(_user_key, s, t), ts);
+  }
+
+  // Sets the internal key to be bigger than or equal to all internal keys
+  // with this user key.
+  void SetMaxPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(
+        &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+  }
+
+  // Sets the internal key to be smaller than or equal to all internal keys
+  // with this user key.
+  void SetMinPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+                                               kValueTypeForSeek));
+  }
+
+  bool Valid() const {
+    ParsedInternalKey parsed;
+    return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
+                .ok());  // TODO
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+  size_t size() const { return rep_.size(); }
+
+  void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    SetFrom(ParsedInternalKey(_user_key, s, t));
+  }
+
+  void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
+           const Slice& ts) {
+    ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t);
+    // Should not call pik.SetTimestamp() directly as it overwrites the buffer
+    // containing _user_key.
+    SetFrom(pik, ts);
+  }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
+    rep_.clear();
+    AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  // The underlying representation.
+  // Intended only to be used together with ConvertFromUserKey().
+  std::string* rep() { return &rep_; }
+
+  // Assuming that *rep() contains a user key, this method makes an internal
+  // key out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+  void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+    AppendInternalKeyFooter(&rep_, s, t);
+  }
+
+  std::string DebugString(bool hex) const;
+};
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+                                          const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline Status ParseInternalKey(const Slice& internal_key,
+                               ParsedInternalKey* result, bool log_err_key) {
+  const size_t n = internal_key.size();
+
+  if (n < kNumInternalBytes) {
+    return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
+                              std::to_string(n) + ". ");
+  }
+
+  uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
+  result->user_key = Slice(internal_key.data(), n - kNumInternalBytes);
+
+  if (IsExtendedValueType(result->type)) {
+    return Status::OK();
+  } else {
+    return Status::Corruption("Corrupted Key",
+                              result->DebugString(log_err_key, true));
+  }
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+  size_t ikey_sz = ikey->size();
+  assert(ikey_sz >= kNumInternalBytes);
+  uint64_t newval = (seq << 8) | t;
+
+  // Note: Since C++11, strings are guaranteed to be stored contiguously and
+  // string::operator[]() is guaranteed not to change ikey.data().
+  EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= kNumInternalBytes);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+  return num >> 8;
+}
+
+// A class to store keys in an efficient way. It allows:
+// 1. Users can either copy the key into it, or have it point to an unowned
+//    address.
+// 2. For a copied key, a short inline buffer is kept to reduce memory
+//    allocation for smaller keys.
+// 3. It tracks whether it holds a user key or an internal key, and allows
+//    conversion between them.
+class IterKey {
+ public:
+  IterKey()
+      : buf_(space_),
+        key_(buf_),
+        key_size_(0),
+        buf_size_(sizeof(space_)),
+        is_user_key_(true) {}
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+
+  ~IterKey() { ResetBuffer(); }
+
+  // The bool will be picked up by the next calls to SetKey
+  void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+  // Returns the key in whichever format it was provided to IterKey.
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the returned result.
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  Slice GetInternalKey() const {
+    assert(!IsUserKey());
+    return Slice(key_, key_size_);
+  }
+
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the returned result of GetUserKey();
+  Slice GetUserKey() const {
+    if (IsUserKey()) {
+      return Slice(key_, key_size_);
+    } else {
+      assert(key_size_ >= kNumInternalBytes);
+      return Slice(key_, key_size_ - kNumInternalBytes);
+    }
+  }
+
+  size_t Size() const { return key_size_; }
+
+  void Clear() { key_size_ = 0; }
+
+  // Append "non_shared_data" to its back, from "shared_len".
+  // This function is used in Block::Iter::ParseNextKey.
+  // shared_len: bytes in [0, shared_len-1] are retained.
+  // non_shared_data: data to be appended; its length must be
+  //                  >= non_shared_len.
+  void TrimAppend(const size_t shared_len, const char* non_shared_data,
+                  const size_t non_shared_len) {
+    assert(shared_len <= key_size_);
+    size_t total_size = shared_len + non_shared_len;
+
+    if (IsKeyPinned() /* key is not in buf_ */) {
+      // Copy the key from external memory to buf_ (copy shared_len bytes)
+      EnlargeBufferIfNeeded(total_size);
+      memcpy(buf_, key_, shared_len);
+    } else if (total_size > buf_size_) {
+      // Need to allocate space, delete previous space
+      char* p = new char[total_size];
+      memcpy(p, key_, shared_len);
+
+      if (buf_ != space_) {
+        delete[] buf_;
+      }
+
+      buf_ = p;
+      buf_size_ = total_size;
+    }
+
+    memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+    key_ = buf_;
+    key_size_ = total_size;
+  }
+
+  Slice SetKey(const Slice& key, bool copy = true) {
+    // is_user_key_ expected to be set already via SetIsUserKey
+    return SetKeyImpl(key, copy);
+  }
+
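+  // Worked example for TrimAppend() above (mirroring dbformat_test.cc):
+  // starting from key "abc", TrimAppend(1, "abcdefghijklmnopqrstuvwxyz", 3)
+  // keeps the first shared byte 'a' and appends "abc", yielding "aabc".
+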
+  // If user-defined timestamp is enabled, then `key` includes timestamp.
+  // TODO(yanqin) this is also used to set a prefix, which does not include
+  // timestamp. Should be handled.
+  Slice SetUserKey(const Slice& key, bool copy = true) {
+    is_user_key_ = true;
+    return SetKeyImpl(key, copy);
+  }
+
+  Slice SetInternalKey(const Slice& key, bool copy = true) {
+    is_user_key_ = false;
+    return SetKeyImpl(key, copy);
+  }
+
+  // Copies the content of key, updates the reference to the user key in ikey
+  // and returns a Slice referencing the new copy.
+  Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+    size_t key_n = key.size();
+    assert(key_n >= kNumInternalBytes);
+    SetInternalKey(key);
+    ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
+    return Slice(key_, key_n);
+  }
+
+  // Copy the key into IterKey's own buf_
+  void OwnKey() {
+    assert(IsKeyPinned() == true);
+
+    Reserve(key_size_);
+    memcpy(buf_, key_, key_size_);
+    key_ = buf_;
+  }
+
+  // Update the sequence number in the internal key. Guarantees not to
+  // invalidate slices to the key (and the user key).
+  void UpdateInternalKey(uint64_t seq, ValueType t,
+                         const Slice* ts = nullptr) {
+    assert(!IsKeyPinned());
+    assert(key_size_ >= kNumInternalBytes);
+    if (ts) {
+      assert(key_size_ >= kNumInternalBytes + ts->size());
+      memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
+             ts->size());
+    }
+    uint64_t newval = (seq << 8) | t;
+    EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+  }
+
+  bool IsKeyPinned() const { return (key_ != buf_); }
+
+  // If `ts` is provided, user_key should not contain timestamp,
+  // and `ts` is appended after user_key.
+  // TODO: more efficient storage for timestamp.
+  void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+                      SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek,
+                      const Slice* ts = nullptr) {
+    size_t psize = key_prefix.size();
+    size_t usize = user_key.size();
+    size_t ts_sz = (ts != nullptr ? ts->size() : 0);
+    EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
+    if (psize > 0) {
+      memcpy(buf_, key_prefix.data(), psize);
+    }
+    memcpy(buf_ + psize, user_key.data(), usize);
+    if (ts) {
+      memcpy(buf_ + psize + usize, ts->data(), ts_sz);
+    }
+    EncodeFixed64(buf_ + usize + psize + ts_sz,
+                  PackSequenceAndType(s, value_type));
+
+    key_ = buf_;
+    key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
+    is_user_key_ = false;
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek,
+                      const Slice* ts = nullptr) {
+    SetInternalKey(Slice(), user_key, s, value_type, ts);
+  }
+
+  void Reserve(size_t size) {
+    EnlargeBufferIfNeeded(size);
+    key_size_ = size;
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(Slice(), parsed_key);
+  }
+
+  void SetInternalKey(const Slice& key_prefix,
+                      const ParsedInternalKey& parsed_key_suffix) {
+    SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+                   parsed_key_suffix.sequence, parsed_key_suffix.type);
+  }
+
+  void EncodeLengthPrefixedKey(const Slice& key) {
+    auto size = key.size();
+    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+    char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+    memcpy(ptr, key.data(), size);
+    key_ = buf_;
+    is_user_key_ = true;
+  }
+
+  bool IsUserKey() const { return is_user_key_; }
+
+ private:
+  char* buf_;
+  const char* key_;
+  size_t key_size_;
+  size_t buf_size_;
+  char space_[32];  // Avoid allocation for short keys
+  bool is_user_key_;
+
+  Slice SetKeyImpl(const Slice& key, bool copy) {
+    size_t size = key.size();
+    if (copy) {
+      // Copy key to buf_
+      EnlargeBufferIfNeeded(size);
+      memcpy(buf_, key.data(), size);
+      key_ = buf_;
+    } else {
+      // Update key_ to point to external memory
+      key_ = key.data();
+    }
+    key_size_ = size;
+    return Slice(key_, key_size_);
+  }
+
+  void ResetBuffer() {
+    if (buf_ != space_) {
+      delete[] buf_;
+      buf_ = space_;
+    }
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Enlarge the buffer size if needed based on key_size.
+  // By default, the statically allocated buffer is used. Once there is a key
+  // larger than the statically allocated buffer, another buffer is
+  // dynamically allocated, until a larger key buffer is requested. In that
+  // case, we reallocate the buffer and delete the old one.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      EnlargeBuffer(key_size);
+    }
+  }
+
+  void EnlargeBuffer(size_t key_size);
+};
+
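+// A note on IterKey's growth strategy, following from the code above: a
+// fresh IterKey holds keys up to sizeof(space_) == 32 bytes inline; the
+// first larger key (say, 100 bytes) triggers exactly one heap allocation of
+// that size, and the buffer is replaced again only when an even larger key
+// arrives (it never shrinks until ResetBuffer()).
+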
+// Convert from a SliceTransform of user keys, to a SliceTransform of
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const override { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const override {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// If this record represents the default column family, then cf_record
+// must be passed as false; otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+                                       bool cf_record);
+
+// Read a record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// slices they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+// If user-defined timestamp is enabled for a column family, then the `key`
+// resulting from this call will include timestamp.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                       uint32_t* column_family, Slice* key,
+                                       Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys,
+// we will store a serialized RangeTombstone in MemTable and SST.
+// The struct here is an easy-to-understand form of it.
+// start_key_/end_key_ is the start/end user key of the range to be deleted.
+struct RangeTombstone {
+  Slice start_key_;
+  Slice end_key_;
+  SequenceNumber seq_;
+  // TODO: we should optimize the storage here when user-defined timestamp
+  // is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
+  Slice ts_;
+  std::string pinned_start_key_;
+  std::string pinned_end_key_;
+
+  RangeTombstone() = default;
+  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+      : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+  // When user-defined timestamp is enabled, `sk` and `ek` should be user
+  // keys with timestamp, and `ts` will replace the timestamps in `sk` and
+  // `ek`.
+  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts)
+      : seq_(sn), ts_(ts) {
+    assert(!ts.empty());
+    pinned_start_key_.reserve(sk.size());
+    pinned_start_key_.append(sk.data(), sk.size() - ts.size());
+    pinned_start_key_.append(ts.data(), ts.size());
+    pinned_end_key_.reserve(ek.size());
+    pinned_end_key_.append(ek.data(), ek.size() - ts.size());
+    pinned_end_key_.append(ts.data(), ts.size());
+    start_key_ = pinned_start_key_;
+    end_key_ = pinned_end_key_;
+  }
+
+  RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+    start_key_ = parsed_key.user_key;
+    seq_ = parsed_key.sequence;
+    end_key_ = value;
+  }
+
+  // Be careful when using Serialize(); it allocates new memory.
+  std::pair<InternalKey, Slice> Serialize() const {
+    auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+    return std::make_pair(std::move(key), end_key_);
+  }
+
+  // Be careful when using SerializeKey(); it allocates new memory.
+  InternalKey SerializeKey() const {
+    return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+  }
+
+  // The tombstone end-key is exclusive, so we generate an internal-key here
+  // which has a similar property. Using kMaxSequenceNumber guarantees that
+  // the returned internal-key will compare less than any other internal-key
+  // with the same user-key. This in turn guarantees that the serialized
+  // end-key for a tombstone such as [a-b] will compare less than the key "b".
+  //
+  // Be careful when using SerializeEndKey(); it allocates new memory.
+  InternalKey SerializeEndKey() const {
+    if (!ts_.empty()) {
+      static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+      if (ts_.size() <= strlen(kTsMax)) {
+        return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+                           Slice(kTsMax, ts_.size()));
+      } else {
+        return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+                           std::string(ts_.size(), '\xff'));
+      }
+    }
+    return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+  }
+};
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+                                          const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    const uint64_t anum =
+        DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
+    const uint64_t bnum =
+        DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+                                                const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    // Shift the number to exclude the last byte which contains the value type
+    const uint64_t anum =
+        DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
+    const uint64_t bnum =
+        DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+inline int InternalKeyComparator::Compare(const Slice& a,
+                                          SequenceNumber a_global_seqno,
+                                          const Slice& b,
+                                          SequenceNumber b_global_seqno) const {
+  int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
+  if (r == 0) {
+    uint64_t a_footer, b_footer;
+    if (a_global_seqno ==
kDisableGlobalSequenceNumber) { + a_footer = ExtractInternalKeyFooter(a); + } else { + a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a)); + } + if (b_global_seqno == kDisableGlobalSequenceNumber) { + b_footer = ExtractInternalKeyFooter(b); + } else { + b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b)); + } + if (a_footer > b_footer) { + r = -1; + } else if (a_footer < b_footer) { + r = +1; + } + } + return r; +} + +// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. +struct ParsedInternalKeyComparator { + explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc new file mode 100644 index 000000000..8dc3387df --- /dev/null +++ b/src/rocksdb/db/dbformat_test.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/dbformat.h" + +#include "table/block_based/index_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +static std::string IKey(const std::string& user_key, uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(), + &result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(), + &result); + return result; +} + +static void TestKey(const std::string& key, uint64_t seq, ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */)); +} + +class FormatTest : public testing::Test {}; + +TEST_F(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"}; + const uint64_t seq[] = {1, + 2, + 3, + (1ull << 8) - 1, + 1ull << 8, + (1ull << 8) + 1, + (1ull << 16) - 1, + 1ull << 16, + (1ull << 16) + 1, + (1ull << 32) - 1, + 1ull << 32, + (1ull << 32) + 1}; + for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST_F(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), 
IKey("foo", 99, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ( + IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue))); + + ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("ABC1AAAAA", 100, kTypeValue), + IKey("ABC2ABB", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2AA", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2A", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1", 100, kTypeValue), + Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ( + IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue))); +} + +TEST_F(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +TEST_F(FormatTest, IterKeyOperation) { + IterKey k; + const char p[] = "abcdefghijklmnopqrstuvwxyz"; + const char q[] = "0123456789"; + + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("")); + + k.TrimAppend(0, p, 3); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abc")); + + k.TrimAppend(1, p, 3); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("aabc")); + + k.TrimAppend(0, p, 26); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz")); + + k.TrimAppend(26, q, 10); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0123456789")); + + k.TrimAppend(36, q, 1); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz01234567890")); + + k.TrimAppend(26, q, 1); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0")); + + // Size going up, memory allocation is triggered + k.TrimAppend(27, p, 26); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0" + "abcdefghijklmnopqrstuvwxyz")); +} + +TEST_F(FormatTest, 
UpdateInternalKey) { + std::string user_key("abcdefghijklmnopqrstuvwxyz"); + uint64_t new_seq = 0x123456; + ValueType new_val_type = kTypeDeletion; + + std::string ikey; + AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue)); + size_t ikey_size = ikey.size(); + UpdateInternalKey(&ikey, new_seq, new_val_type); + ASSERT_EQ(ikey_size, ikey.size()); + + Slice in(ikey); + ParsedInternalKey decoded; + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); + ASSERT_EQ(user_key, decoded.user_key.ToString()); + ASSERT_EQ(new_seq, decoded.sequence); + ASSERT_EQ(new_val_type, decoded.type); +} + +TEST_F(FormatTest, RangeTombstoneSerializeEndKey) { + RangeTombstone t("a", "b", 2); + InternalKey k("b", 3, kTypeValue); + const InternalKeyComparator cmp(BytewiseComparator()); + ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc new file mode 100644 index 000000000..34925e828 --- /dev/null +++ b/src/rocksdb/db/deletefile_test.cc @@ -0,0 +1,614 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
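The dbformat tests above all revolve around the internal key layout: a user key followed by an 8-byte footer that packs the sequence number and value type, which is why `DecodeFixed64(... - kNumInternalBytes)` and the `>> 8` shift appear in the comparators. Below is a minimal standalone sketch of that encoding; it is plain C++ outside the patch, and `MakeInternalKey`/`SplitInternalKey` are stand-in names for RocksDB's `AppendInternalKey`/`ParseInternalKey`, not the real API.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

// 8-byte footer, mirroring kNumInternalBytes in dbformat.h.
constexpr size_t kFooterBytes = 8;

// Append user_key followed by a little-endian fixed64 footer packing
// (sequence << 8) | type, as PackSequenceAndType does in dbformat.
std::string MakeInternalKey(const std::string& user_key, uint64_t seq,
                            uint8_t type) {
  uint64_t footer = (seq << 8) | type;
  std::string ikey = user_key;
  char buf[kFooterBytes];
  std::memcpy(buf, &footer, kFooterBytes);  // assumes a little-endian host
  ikey.append(buf, kFooterBytes);
  return ikey;
}

// Split an internal key back into its pieces: the user key is everything
// except the trailing 8 bytes.
void SplitInternalKey(const std::string& ikey, std::string* user_key,
                      uint64_t* seq, uint8_t* type) {
  uint64_t footer;
  std::memcpy(&footer, ikey.data() + ikey.size() - kFooterBytes, kFooterBytes);
  *user_key = ikey.substr(0, ikey.size() - kFooterBytes);
  *seq = footer >> 8;
  *type = static_cast<uint8_t>(footer & 0xff);
}

int main() {
  std::string ikey = MakeInternalKey("foo", 100, 1 /* kTypeValue */);
  std::string user_key;
  uint64_t seq;
  uint8_t type;
  SplitInternalKey(ikey, &user_key, &seq, &type);
  assert(user_key == "foo" && seq == 100 && type == 1);
  return 0;
}
```

Because the comparator orders equal user keys by decreasing footer, a newer write (larger sequence number) sorts before an older one, which is exactly what the `InternalKey_EncodeDecode` and short-separator tests rely on.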
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+  const int numlevels_;
+  const std::string wal_dir_;
+
+  DeleteFileTest()
+      : DBTestBase("deletefile_test", /*env_do_fsync=*/true),
+        numlevels_(7),
+        wal_dir_(dbname_ + "/wal_files") {}
+
+  void SetOptions(Options* options) {
+    ASSERT_NE(options, nullptr);
+    options->delete_obsolete_files_period_micros = 0;  // always do full purge
+    options->enable_thread_tracking = true;
+    options->write_buffer_size = 1024 * 1024 * 1000;
+    options->target_file_size_base = 1024 * 1024 * 1000;
+    options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options->WAL_ttl_seconds = 300;     // Used to test log files
+    options->WAL_size_limit_MB = 1024;  // Used to test log files
+    options->wal_dir = wal_dir_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
+                      std::vector<int>* keysperlevel = nullptr) {
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(), metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+    }
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kWalFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    if (required_log >= 0) {
+      ASSERT_EQ(required_log, log_cnt);
+    }
+    if (required_sst >= 0) {
+      ASSERT_EQ(required_sst, sst_cnt);
+    }
+    if (required_manifest >= 0) {
+      ASSERT_EQ(required_manifest, manifest_cnt);
+    }
+  }
+
+  static void DoSleep(void* arg) {
+    auto test = reinterpret_cast<DeleteFileTest*>(arg);
+    test->env_->SleepForMicroseconds(2 * 1000
* 1000);
+  }
+
+  // An empty job to guard that all jobs are processed
+  static void GuardFinish(void* /*arg*/) {
+    TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+  }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup. Levels 1 and 2 should each hold 50K keys.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  status = db_->DeleteFile(level2file);
+  ASSERT_OK(status);
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  Reopen(options);
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  ASSERT_OK(itr->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  ASSERT_OK(itr->status());
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  itr = db_->NewIterator(read_options);
+  
ASSERT_OK(itr->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  test::SleepingBackgroundTask sleeping_task_before;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_before, Env::Priority::HIGH);
+  delete itr;
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  sleeping_task_before.WakeUp();
+  sleeping_task_before.WaitUntilDone();
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, PurgeDuringOpen) {
+  Options options = CurrentOptions();
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+  Close();
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+                                         EnvOptions()));
+  ASSERT_OK(file->Close());
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  options.avoid_unnecessary_blocking_io = false;
+  options.create_if_missing = false;
+  Reopen(options);
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+  Close();
+
+  // test background purge
+  options.avoid_unnecessary_blocking_io = true;
+  options.create_if_missing = false;
+  ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+                                         EnvOptions()));
+  ASSERT_OK(file->Close());
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  // the obsolete file is not deleted until the background purge job has run
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1");
+  ASSERT_OK(dbfull()->TEST_WaitForPurge());
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  auto do_test = [&](bool bg_purge) {
+    ColumnFamilyOptions co;
+    co.max_write_buffer_size_to_maintain =
+        static_cast<int64_t>(co.write_buffer_size);
+    WriteOptions wo;
+    FlushOptions fo;
+    ColumnFamilyHandle* cfh = nullptr;
+
+    ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+    ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+    ASSERT_OK(db_->Flush(fo, cfh));
+    // Expect 1 sst file.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    ASSERT_OK(db_->DropColumnFamily(cfh));
+    // Still 1 file; it won't be deleted while the ColumnFamilyHandle is
+    // alive.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    delete cfh;
+    test::SleepingBackgroundTask sleeping_task_after;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_task_after, Env::Priority::HIGH);
+    // If background purge is enabled, the file should still be there.
+    CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+    TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+    // Execute background purges.
+    sleeping_task_after.WakeUp();
+    sleeping_task_after.WaitUntilDone();
+    // The file should have been deleted.
+    CheckFileTypeCounts(dbname_, 0, 0, 1);
+  };
+
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+    do_test(false);
+  }
+
+  options.avoid_unnecessary_blocking_io = true;
+  options.create_if_missing = false;
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForPurge());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+        "DBImpl::BGWorkPurge:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+    do_test(true);
+  }
+}
+
+// This test reproduces a bug where an invalid ReadOptions was read in the
+// iterator cleanup function
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  {
+    ReadOptions read_options;
+    read_options.background_purge_on_iterator_cleanup = true;
+    itr = db_->NewIterator(read_options);
+    ASSERT_OK(itr->status());
+    // The ReadOptions is deleted, but the iterator cleanup function should
+    // not be affected
+  }
+
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  Iterator* itr1 = db_->NewIterator(read_options);
+  ASSERT_OK(itr1->status());
+  CreateTwoLevels();
+  Iterator* itr2 = db_->NewIterator(read_options);
+  ASSERT_OK(itr2->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 5 sst files after 2 compactions with 2 live iterators
+  CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // ~DBImpl should wait until all BGWorkPurge are finished
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+       {"DeleteFileTest::GuardFinish",
+        "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  delete itr1;
+  env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+  delete itr2;
+  env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+  Close();
+
+  
TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  ReadOptions read_options;
+  Iterator* it = db_->NewIterator(read_options);
+  ASSERT_OK(it->status());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file;
+
+  ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(),
+          status.ToString().c_str());
+  ASSERT_OK(status);
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while (it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file, which is expected to be alive, and try to delete
+  // it. This should not succeed because live logs are not allowed to be
+  // deleted.
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file %s\n",
+          alive_log->PathName().c_str());
+  ASSERT_NOK(db_->DeleteFile(alive_log->PathName()));
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable and move the alive log to the
+  // archived log, then try to delete the archived log file.
+  FlushOptions fopts;
+  ASSERT_OK(db_->Flush(fopts));
+  AddKeys(10, 0);
+  ASSERT_OK(db_->Flush(fopts));
+  ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+  ASSERT_GT(logfiles.size(), 0UL);
+  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+  fprintf(stdout, "Deleting archived log file %s\n",
+          archived_log->PathName().c_str());
+  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+  ASSERT_TRUE(
+      env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound());
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+  CreateAndReopenWithCF({"new_cf"}, options);
+
+  Random rnd(5);
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_EQ("new_cf", metadata[0].column_family_name);
+  
ASSERT_EQ("new_cf", metadata[1].column_family_name); + auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db_->DeleteFile(old_file)); + + { + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + ASSERT_OK(itr->status()); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options); + + { + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc new file mode 100644 index 000000000..7f68bb026 --- /dev/null +++ b/src/rocksdb/db/error_handler.cc @@ -0,0 +1,819 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/error_handler.h" + +#include "db/db_impl/db_impl.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +// Maps to help decide the severity of an error based on the +// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks +// is set or not. There are 3 maps, going from most specific to least specific +// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and +// paranoid_checks). The less specific map serves as a catch all in case we miss +// a specific error code or subcode. 
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode,
+                    bool>,
+         Status::Severity>
+    ErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kSoftError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kSpaceLimit, true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, false),
+         Status::Severity::kFatalError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during MANIFEST write
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush with WAL disabled
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during MANIFEST write when WAL is disabled
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
+         Status::Severity>
+    DefaultErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush with WAL disabled
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, false),
+         Status::Severity::kFatalError},
+};
+
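One property worth making explicit before the last map: the severities these tables produce form an ordered scale, and the handler later only ever upgrades the stored background error, never downgrades it (see the `severity() > bg_error_.severity()` checks in `HandleKnownErrors()` and `CheckAndSetRecoveryAndBGError()` below). A tiny standalone sketch of that monotonic-upgrade rule, with stand-in types rather than the RocksDB `Status` class:

```cpp
#include <cassert>

enum class Sev { kNoError, kSoftError, kHardError, kFatalError };

struct BgError {
  Sev sev = Sev::kNoError;
  // Keep the more severe of the current and incoming errors.
  void MaybeUpgrade(Sev incoming) {
    if (incoming > sev) sev = incoming;
  }
};

int main() {
  BgError bg;
  bg.MaybeUpgrade(Sev::kHardError);
  bg.MaybeUpgrade(Sev::kSoftError);  // ignored: less severe than current
  assert(bg.sev == Sev::kHardError);
  return 0;
}
```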
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+    DefaultReasonMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+         Status::Severity::kFatalError},
+        // Errors during Memtable update
+        {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+         Status::Severity::kFatalError},
+};
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // We'll release the lock before calling sfm, so make sure no new
+  // recovery gets scheduled at that point
+  auto_recovery_ = false;
+  SstFileManagerImpl* sfm =
+      reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+  if (sfm) {
+    // This may or may not cancel a pending recovery
+    db_mutex_->Unlock();
+    bool cancelled = sfm->CancelErrorRecovery(this);
+    db_mutex_->Lock();
+    if (cancelled) {
+      recovery_in_prog_ = false;
+    }
+  }
+
+  // If auto recovery is also running to resume from the retryable error,
+  // we should wait and end the auto recovery.
+  EndAutoRecovery();
+#endif
+}
+
+STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity, and error recovery strategy. The high
+// level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+//    DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+//    if needed. The reason for this is our ability to recover may depend on
+//    the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+//    is called, which can disable the auto recovery even if we decide it's
+//    feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+//    the actual recovery. If no sst file manager is specified in DBOptions,
+//    a default one is allocated during DB::Open(), so there will always be
+//    one.
+// This can also get called as part of a recovery operation.
In that case, we
+// also track the error separately in recovery_error_ so we can tell in the
+// end whether recovery succeeded or not
+const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
+                                              BackgroundErrorReason reason) {
+  db_mutex_->AssertHeld();
+  if (bg_err.ok()) {
+    return kOkStatus;
+  }
+
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+  }
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "ErrorHandler: Set regular background error\n");
+
+  bool paranoid = db_options_.paranoid_checks;
+  Status::Severity sev = Status::Severity::kFatalError;
+  Status new_bg_err;
+  DBRecoverContext context;
+  bool found = false;
+
+  {
+    auto entry = ErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
+    if (entry != ErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), paranoid));
+    if (entry != DefaultErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+    if (entry != DefaultReasonMap.end()) {
+      sev = entry->second;
+    }
+  }
+
+  new_bg_err = Status(bg_err, sev);
+
+  // Check if recovery is currently in progress. If it is, we will save this
+  // error so we can check it at the end to see if recovery succeeded or not
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = new_bg_err;
+  }
+
+  bool auto_recovery = auto_recovery_;
+  if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+    auto_recovery = false;
+  }
+
+  // Allow some error specific overrides
+  if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+      new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+    new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+  }
+
+  if (!new_bg_err.ok()) {
+    Status s = new_bg_err;
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+                                          db_mutex_, &auto_recovery);
+    if (!s.ok() && (s.severity() > bg_error_.severity())) {
+      bg_error_ = s;
+    } else {
+      // This error is less severe than the previously encountered error.
+      // Don't take any further action
+      return bg_error_;
+    }
+  }
+
+  recover_context_ = context;
+  if (auto_recovery) {
+    recovery_in_prog_ = true;
+
+    // Kick-off error specific recovery
+    if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+        new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+      RecoverFromNoSpace();
+    }
+  }
+  if (bg_error_.severity() >= Status::Severity::kHardError) {
+    is_db_stopped_.store(true, std::memory_order_release);
+  }
+  return bg_error_;
+}
+
+// This is the main function for looking at IO related errors during
+// background operations. The main logic is:
+// 1) File scope IO errors are treated as retryable IO errors in the write
+//    path. In RocksDB, if a file has a write IO error at file scope,
+//    RocksDB never writes to the same file again. RocksDB will create a new
+//    file and rewrite the whole content. Thus, it is retryable.
+// 2) if the error is caused by data loss, the error is mapped to an
+//    unrecoverable error. Application/user must take action to handle
+//    this situation (the file scope case is excluded).
+// 3) if the error is a Retryable IO error (i.e., it is a file scope IO error,
+//    or its retryable flag is set and it is not a data loss error), auto
+//    resume will be called, and the auto resume can be controlled by the
+//    resume count and resume interval options. There are three sub-cases:
+//    a) if the error happens during compaction, it is mapped to a soft error.
+//       the compaction thread will reschedule a new compaction.
+//    b) if the error happens during flush and also WAL is empty, it is mapped
+//       to a soft error. Note that, it includes the case that IO error happens
+//       in SST or manifest write during flush.
+//    c) all other errors are mapped to hard error.
+// 4) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason
+//    reason) will be called to handle other error cases.
+const Status& ErrorHandler::SetBGError(const Status& bg_status,
+                                       BackgroundErrorReason reason) {
+  db_mutex_->AssertHeld();
+  Status tmp_status = bg_status;
+  IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
+
+  if (bg_io_err.ok()) {
+    return kOkStatus;
+  }
+  ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
+                 bg_io_err.ToString().c_str());
+
+  if (recovery_in_prog_ && recovery_io_error_.ok()) {
+    recovery_io_error_ = bg_io_err;
+  }
+  if (BackgroundErrorReason::kManifestWrite == reason ||
+      BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+    // Always returns ok
+    ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions");
+    db_->DisableFileDeletionsWithLock().PermitUncheckedError();
+  }
+
+  Status new_bg_io_err = bg_io_err;
+  DBRecoverContext context;
+  if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
+      bg_io_err.GetDataLoss()) {
+    // First, data loss (non file scope) is treated as an unrecoverable error.
+    // So it can directly overwrite any existing bg_error_.
+    bool auto_recovery = false;
+    Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
+    CheckAndSetRecoveryAndBGError(bg_err);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "ErrorHandler: Set background IO error as unrecoverable error\n");
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+                                          &bg_err, db_mutex_, &auto_recovery);
+    recover_context_ = context;
+    return bg_error_;
+  } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
+             (bg_io_err.GetScope() ==
+                  IOStatus::IOErrorScope::kIOErrorScopeFile ||
+              bg_io_err.GetRetryable())) {
+    // Second, check if the error is a retryable IO error (a file scope IO
+    // error is also treated as a retryable IO error in the RocksDB write
+    // path). If it is a retryable error and its severity is higher than
+    // bg_error_, overwrite bg_error_ with the new error. At the current
+    // stage, a retryable IO error from compaction is treated as a soft
+    // error; in other cases, the retryable IO error is treated as a hard
+    // error. Note that all NoSpace errors should be handled by
+    // SstFileManager::StartErrorRecovery(); therefore, no matter whether the
+    // error is retryable or file scope, this logic will be bypassed.
+    bool auto_recovery = false;
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+                                          &new_bg_io_err, db_mutex_,
+                                          &auto_recovery);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "ErrorHandler: Set background retryable IO error\n");
+    if (BackgroundErrorReason::kCompaction == reason) {
+      // We map a retryable IO error during compaction to a soft error, since
+      // compaction can reschedule by itself. We will not set the BG error in
+      // this case
+      // TODO: a better way to set or clean the retryable IO error which
+      // happens during compaction SST file write.
+      if (bg_error_stats_ != nullptr) {
+        RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+      }
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "ErrorHandler: Compaction will schedule by itself to resume\n");
+      return bg_error_;
+    } else if (BackgroundErrorReason::kFlushNoWAL == reason ||
+               BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+      // When the BG retryable IO error reason is flush without WAL,
+      // we map it to a soft error. At the same time, all the background work
+      // should be stopped except the BG work from recovery. Therefore, we
+      // set soft_error_no_bg_work_ to true. At the same time, since the DB
+      // continues to receive writes while the BG error is a soft error, to
+      // avoid too many small memtables being generated during auto resume,
+      // the flush reason is set to kErrorRecoveryRetryFlush.
+      Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
+      CheckAndSetRecoveryAndBGError(bg_err);
+      soft_error_no_bg_work_ = true;
+      context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+      recover_context_ = context;
+      return StartRecoverFromRetryableBGIOError(bg_io_err);
+    } else {
+      Status bg_err(new_bg_io_err, Status::Severity::kHardError);
+      CheckAndSetRecoveryAndBGError(bg_err);
+      recover_context_ = context;
+      return StartRecoverFromRetryableBGIOError(bg_io_err);
+    }
+  } else {
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+    }
+    // HandleKnownErrors() will use recovery_error_, so ignore
+    // recovery_io_error_.
+    // TODO: Do some refactoring and use only one recovery_error_
+    recovery_io_error_.PermitUncheckedError();
+    return HandleKnownErrors(new_bg_io_err, reason);
+  }
+}
+
+Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
+                                          bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+  if (bg_error.severity() >= Status::Severity::kFatalError) {
+    return bg_error;
+  }
+
+  if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on SFM to poll for enough disk space and recover
+    *auto_recovery = false;
+    return bg_error;
+  }
+
+  if (db_options_.allow_2pc &&
+      (bg_error.severity() <= Status::Severity::kSoftError)) {
+    // Don't know how to recover, as the contents of the current WAL file may
+    // be inconsistent, and it may be needed for 2PC.
If 2PC is not enabled,
+    // we can just flush the memtable and discard the log
+    *auto_recovery = false;
+    return Status(bg_error, Status::Severity::kFatalError);
+  }
+
+  {
+    uint64_t free_space;
+    if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+                                      &free_space) == Status::NotSupported()) {
+      *auto_recovery = false;
+    }
+  }
+
+  return bg_error;
+#else
+  (void)auto_recovery;
+  return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+  SstFileManagerImpl* sfm =
+      reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+  // Inform SFM of the error, so it can kick-off the recovery
+  if (sfm) {
+    sfm->StartErrorRecovery(this, bg_error_);
+  }
+#endif
+}
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // Signal that recovery succeeded
+  if (recovery_error_.ok()) {
+    Status old_bg_error = bg_error_;
+    // old_bg_error is only for notifying listeners, so may not be checked
+    old_bg_error.PermitUncheckedError();
+    // Clear and check the recovery IO and BG error
+    bg_error_ = Status::OK();
+    recovery_io_error_ = IOStatus::OK();
+    bg_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+    recovery_in_prog_ = false;
+    soft_error_no_bg_work_ = false;
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+                                           bg_error_, db_mutex_);
+  }
+  return recovery_error_;
+#else
+  return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+  InstrumentedMutexLock l(db_mutex_);
+  bool no_bg_work_original_flag = soft_error_no_bg_work_;
+  if (is_manual) {
+    // If it's a manual recovery and there's a background recovery in
+    // progress, return a busy status
+    if (recovery_in_prog_) {
+      return Status::Busy();
+    }
+    recovery_in_prog_ = true;
+
+    // In manual resume, we allow the bg work to run. If it is an auto
+    // resume, the bg work should follow this tag.
+    soft_error_no_bg_work_ = false;
+
+    // In manual resume, if the bg error is a soft error that also requires
+    // no bg work, the error must be recovered by calling flush with the
+    // flush reason kErrorRecoveryRetryFlush. In other cases, the flush
+    // reason is set to kErrorRecovery.
+    if (no_bg_work_original_flag) {
+      recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+    } else {
+      recover_context_.flush_reason = FlushReason::kErrorRecovery;
+    }
+  }
+
+  if (bg_error_.severity() == Status::Severity::kSoftError &&
+      recover_context_.flush_reason == FlushReason::kErrorRecovery) {
+    // Simply clear the background error and return
+    recovery_error_ = Status::OK();
+    return ClearBGError();
+  }
+
+  // Reset recovery_error_. We will use this to record any errors that happen
+  // during the recovery process. While recovering, the only operations that
+  // can generate background errors should be the flush operations
+  recovery_error_ = Status::OK();
+  recovery_error_.PermitUncheckedError();
+  Status s = db_->ResumeImpl(recover_context_);
+  if (s.ok()) {
+    soft_error_no_bg_work_ = false;
+  } else {
+    soft_error_no_bg_work_ = no_bg_work_original_flag;
+  }
+
+  // For manual recovery, shutdown, and fatal error cases, set
+  // recovery_in_prog_ to false.
For automatic background recovery, leave it
+  // as is regardless of success or failure, as it will be retried
+  if (is_manual || s.IsShutdownInProgress() ||
+      bg_error_.severity() >= Status::Severity::kFatalError) {
+    recovery_in_prog_ = false;
+  }
+  return s;
+#else
+  (void)is_manual;
+  return bg_error_;
+#endif
+}
+
+const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
+    const IOStatus& io_error) {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+  if (bg_error_.ok()) {
+    return bg_error_;
+  } else if (io_error.ok()) {
+    return kOkStatus;
+  } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
+    // Auto resume for BG errors is not enabled; directly return bg_error_.
+    return bg_error_;
+  }
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
+  if (recovery_thread_) {
+    // In this case, if recovery_in_prog_ is false, the current thread should
+    // wait for the previous recovery thread to finish and create a new
+    // thread to recover from the bg error.
+    db_mutex_->Unlock();
+    recovery_thread_->join();
+    db_mutex_->Lock();
+  }
+
+  recovery_in_prog_ = true;
+  recovery_thread_.reset(
+      new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
+
+  if (recovery_io_error_.ok() && recovery_error_.ok()) {
+    return recovery_error_;
+  } else {
+    return bg_error_;
+  }
+#else
+  (void)io_error;
+  return bg_error_;
+#endif
+}
+
+// Automatic recovery from a retryable BG IO error. Must be called after the
+// db mutex is released.
+void ErrorHandler::RecoverFromRetryableBGIOError() {
+#ifndef ROCKSDB_LITE
+  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
+  InstrumentedMutexLock l(db_mutex_);
+  if (end_recovery_) {
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                           Status::ShutdownInProgress(),
+                                           db_mutex_);
+    return;
+  }
+  DBRecoverContext context = recover_context_;
+  int resume_count = db_options_.max_bgerror_resume_count;
+  uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+  uint64_t retry_count = 0;
+  // Recover from the retryable error. Create a separate thread to do it.
+  while (resume_count > 0) {
+    if (end_recovery_) {
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             Status::ShutdownInProgress(),
+                                             db_mutex_);
+      return;
+    }
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
+    recovery_io_error_ = IOStatus::OK();
+    recovery_error_ = Status::OK();
+    retry_count++;
+    Status s = db_->ResumeImpl(context);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+    }
+    if (s.IsShutdownInProgress() ||
+        bg_error_.severity() >= Status::Severity::kFatalError) {
+      // If DB shutdown is in progress or the error severity is higher than
+      // a hard error, stop auto resume and return.
+      recovery_in_prog_ = false;
+      if (bg_error_stats_ != nullptr) {
+        RecordInHistogram(bg_error_stats_.get(),
+                          ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+      }
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             bg_error_, db_mutex_);
+      return;
+    }
+    if (!recovery_io_error_.ok() &&
+        recovery_error_.severity() <= Status::Severity::kHardError &&
+        recovery_io_error_.GetRetryable()) {
+      // If a new BG IO error happens during auto recovery, it is retryable,
+      // and its severity is a hard error or lower, auto resume sleeps for
+      // a period of time and then retries if it is allowed.
+      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
+      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
+      int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
+      cv_.TimedWait(wait_until);
+    } else {
+      // There are three possibilities: 1) recovery_io_error_ is set during
+      // resume and the error is not retryable, 2) recovery is successful,
+      // 3) another error happens during resume and cannot be resumed here.
+      if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) {
+        // Recovered from the retryable IO error with no other BG errors.
+        // Clean bg_error_ and notify the user.
+        TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
+        Status old_bg_error = bg_error_;
+        is_db_stopped_.store(false, std::memory_order_release);
+        bg_error_ = Status::OK();
+        bg_error_.PermitUncheckedError();
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
+        if (bg_error_stats_ != nullptr) {
+          RecordTick(bg_error_stats_.get(),
+                     ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
+          RecordInHistogram(bg_error_stats_.get(),
+                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+        }
+        recovery_in_prog_ = false;
+        if (soft_error_no_bg_work_) {
+          soft_error_no_bg_work_ = false;
+        }
+        return;
+      } else {
+        // In this case: 1) recovery_io_error_ is more serious or not
+        // retryable, or 2) another non-IO recovery_error_ happened. The auto
+        // recovery stops.
+        recovery_in_prog_ = false;
+        if (bg_error_stats_ != nullptr) {
+          RecordInHistogram(bg_error_stats_.get(),
+                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+        }
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, bg_error_,
+            !recovery_io_error_.ok()
+                ? recovery_io_error_
+                : (!recovery_error_.ok() ?
+            db_mutex_);
+        return;
+      }
+    }
+    resume_count--;
+  }
+  recovery_in_prog_ = false;
+  EventHelpers::NotifyOnErrorRecoveryEnd(
+      db_options_.listeners, bg_error_,
+      Status::Aborted("Exceeded resume retry count"), db_mutex_);
+  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
+  if (bg_error_stats_ != nullptr) {
+    RecordInHistogram(bg_error_stats_.get(),
+                      ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+  }
+  return;
+#else
+  return;
+#endif
+}
+
+void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = bg_err;
+  }
+  if (bg_err.severity() > bg_error_.severity()) {
+    bg_error_ = bg_err;
+  }
+  if (bg_error_.severity() >= Status::Severity::kHardError) {
+    is_db_stopped_.store(true, std::memory_order_release);
+  }
+  return;
+}
+
+void ErrorHandler::EndAutoRecovery() {
+  db_mutex_->AssertHeld();
+  if (!end_recovery_) {
+    end_recovery_ = true;
+  }
+  cv_.SignalAll();
+  db_mutex_->Unlock();
+  if (recovery_thread_) {
+    recovery_thread_->join();
+  }
+  db_mutex_->Lock();
+  return;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..34e08a525
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// This structure is used to store the DB recovery context. The context holds
+// the information related to the recovery actions. For example, it contains
+// FlushReason, which tells the flush job why this flush is called.
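+//
+// A minimal usage sketch (hypothetical call site, for illustration only):
+//
+//   DBRecoverContext ctx(FlushReason::kErrorRecovery);
+//   // ctx.flush_reason is later passed to the flush scheduled by recovery.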
+struct DBRecoverContext {
+  FlushReason flush_reason;
+
+  DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
+
+  DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
+};
+
+class ErrorHandler {
+ public:
+  ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+               InstrumentedMutex* db_mutex)
+      : db_(db),
+        db_options_(db_options),
+        cv_(db_mutex),
+        end_recovery_(false),
+        recovery_thread_(nullptr),
+        db_mutex_(db_mutex),
+        auto_recovery_(false),
+        recovery_in_prog_(false),
+        soft_error_no_bg_work_(false),
+        is_db_stopped_(false),
+        bg_error_stats_(db_options.statistics) {
+    // Clear the checked flag for uninitialized errors
+    bg_error_.PermitUncheckedError();
+    recovery_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+  }
+
+  void EnableAutoRecovery() { auto_recovery_ = true; }
+
+  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+                                    Status::Code code, Status::SubCode subcode);
+
+  const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+  Status GetBGError() const { return bg_error_; }
+
+  Status GetRecoveryError() const { return recovery_error_; }
+
+  Status ClearBGError();
+
+  bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
+
+  bool IsBGWorkStopped() {
+    assert(db_mutex_);
+    db_mutex_->AssertHeld();
+    return !bg_error_.ok() &&
+           (bg_error_.severity() >= Status::Severity::kHardError ||
+            !auto_recovery_ || soft_error_no_bg_work_);
+  }
+
+  bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
+
+  bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+  Status RecoverFromBGError(bool is_manual = false);
+  void CancelErrorRecovery();
+
+  void EndAutoRecovery();
+
+ private:
+  DBImpl* db_;
+  const ImmutableDBOptions& db_options_;
+  Status bg_error_;
+  // A separate Status variable used to record any errors during the
+  // recovery process from hard errors
+  Status recovery_error_;
+  // A separate IO Status variable used to record any IO errors during
+  // the recovery process. At the same time, recovery_error_ is also set.
+  IOStatus recovery_io_error_;
+  // The condition variable used with db_mutex_ during auto resume, for
+  // timed waits.
+  InstrumentedCondVar cv_;
+  bool end_recovery_;
+  std::unique_ptr<port::Thread> recovery_thread_;
+
+  InstrumentedMutex* db_mutex_;
+  // A flag indicating whether automatic recovery from errors is enabled
+  bool auto_recovery_;
+  bool recovery_in_prog_;
+  // A flag to indicate that, on a soft error, no background work should be
+  // allowed except work that is issued by the recovery itself.
+  bool soft_error_no_bg_work_;
+
+  // Used to store the context for recovery, such as the flush reason.
+  DBRecoverContext recover_context_;
+  std::atomic<bool> is_db_stopped_;
+
+  // Pointer to the DB statistics object; may be null.
+  std::shared_ptr<Statistics> bg_error_stats_;
+
+  const Status& HandleKnownErrors(const Status& bg_err,
+                                  BackgroundErrorReason reason);
+  Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
+  void RecoverFromNoSpace();
+  const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
+  void RecoverFromRetryableBGIOError();
+  // First, if recovery is in progress and recovery_error_ is ok, set
+  // recovery_error_ to bg_err. Second, if the severity of bg_err is higher
+  // than the current bg_error_, overwrite bg_error_ with it.
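+  // For example, a kHardError arriving while bg_error_ holds a kSoftError
+  // overwrites it (and marks the DB stopped); a later kSoftError would be
+  // ignored because its severity is not higher than the current one.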
+ void CheckAndSetRecoveryAndBGError(const Status& bg_err); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/error_handler_fs_test.cc b/src/rocksdb/db/error_handler_fs_test.cc new file mode 100644 index 000000000..153f3b79e --- /dev/null +++ b/src/rocksdb/db/error_handler_fs_test.cc @@ -0,0 +1,2875 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" +#include "port/stack_trace.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +class DBErrorHandlingFSTest : public DBTestBase { + public: + DBErrorHandlingFSTest() + : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::string GetManifestNameFromLiveFiles() { + std::vector live_files; + uint64_t manifest_size; + + Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false); + if (!s.ok()) { + return ""; + } + for (auto& file : live_files) { + uint64_t num = 0; + FileType type; + if (ParseFileName(file, &num, &type) && type == kDescriptorFile) { + return file; + } + } + return ""; + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class ErrorHandlerFSListener : public EventListener { + public: + ErrorHandlerFSListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_fs_(nullptr) {} + ~ErrorHandlerFSListener() { + file_creation_error_.PermitUncheckedError(); + bg_error_.PermitUncheckedError(); + new_bg_error_.PermitUncheckedError(); + } + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_fs_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = IOStatus::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error, + bool* auto_recovery) override { + bg_error.PermitUncheckedError(); + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + new_bg_error_ = info.new_bg_error; + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while 
(!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count, + IOStatus io_s) { + fault_fs_ = fs; + file_count_ = file_count; + file_creation_error_ = io_s; + } + + Status new_bg_error() { return new_bg_error_; } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + bool file_creation_started_; + bool override_bg_error_; + int file_count_; + IOStatus file_creation_error_; + Status bg_error_; + Status new_bg_error_; + FaultInjectionTestFS* fault_fs_; +}; + +TEST_F(DBErrorHandlingFSTest, FLushWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + Destroy(options); +} + +// All the NoSpace IOError will be handled as the regular BG Error no matter the +// retryable flag is set of not. So the auto resume for retryable IO Error will +// not be triggered. Also, it is mapped as hard error. 
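+//
+// In other words, whether or not the retryable flag is set, a NoSpace error
+// is mapped to a hard background error and auto resume is not triggered.
+// The status the test below injects is built like this:
+//
+//   IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error");
+//   error_msg.SetRetryable(true);  // still handled as a regular BG error
+//
+// which is why all ERROR_HANDLER_AUTORESUME_* counters are expected to be 0.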
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + 
SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+  error_msg.SetDataLoss(true);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+  error_msg.SetRetryable(false);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val1", Get(Key(1)));
+
+  ASSERT_OK(Put(Key(2), "val2"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeSyncTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  // Not file scope, but the retryable flag is set.
+  error_msg.SetDataLoss(false);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem);
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
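+  // As the assertions in this test check, both a file-scoped data-loss
+  // error and a retryable filesystem-scoped error are mapped to soft errors.
+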
SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.atomic_flush = true; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The flush error is injected before we finish the table build +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = 
WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Destroy(options); +} + +// The retryable IO error is injected before we sync table +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The retryable IO error is injected before we close the table file +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { 
fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + 
Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + + // This Resume() will attempt to 
create a new manifest file and fail again + s = dbfull()->Resume(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // A successful Resume() will create a new manifest file + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + // Wait for DB instance to clear bg_error before calling + // TEST_WaitForCompact + {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + // This Flush will trigger a compaction, which will fail when appending to + // the manifest + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + // Clear all errors so when the compaction is retried, it will succeed + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + TEST_SYNC_POINT("CompactionManifestWriteError:2"); + + s = dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + 
options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + 
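// Resume() clears the saved bg_error_ and re-enables background work once
+  // the fault filesystem has been reactivated, which is why it returns OK.
+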
Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, CorruptionError) { + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + 
options.level0_file_num_compaction_trigger = 2; + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::Corruption("Corruption")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_NOK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options).PermitUncheckedError(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteError) { + if (mem_env_ != nullptr) { + 
ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to + // drain all callbacks. Otherwise, a pending callback in the background + // could re-disable `fault_fs_` after we enable it below. + SyncPoint::GetInstance()->ClearAllCallBacks(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 0; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. 
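+  // (In the callback below, the write_error counter lets the first two
+  // Appends succeed and trips the fault filesystem on the third, so the
+  // failure lands in the middle of the WAL write.)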
+ { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOError()); + } + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + ASSERT_OK(dbfull()->Resume()); + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Random rnd(301); + + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024))); + } + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsNoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to + // drain all callbacks. Otherwise, a pending callback in the background + // could re-disable `fault_fs_` after we enable it below. 
+ SyncPoint::GetInstance()->ClearAllCallBacks(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + fault_fs[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + 
std::to_string(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_fs[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_fs[i], 2, + IOStatus::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + if (i != 1) { + ASSERT_OK(db[i]->Flush(FlushOptions())); + } else { + ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_OK(s); + break; + } + fault_fs[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + if (i < 2) { + 
+      ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    }
+    if (i == 1) {
+      ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+    }
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  SstFileManagerImpl* sfmImpl =
+      static_cast_with_check<SstFileManagerImpl>(sfm.get());
+  sfmImpl->Close();
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    fault_fs[i]->SetFilesystemActive(true);
+    delete db[i];
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+    }
+  }
+  options.clear();
+  delete def_env;
+}
+
+// When a KV pair is put with the WAL disabled in the write options and a
+// retryable error happens in this condition, the bg error is mapped to a
+// soft error and auto resume is triggered. During auto resume, SwitchMemtable
+// is disabled to avoid small SST tables. Writes can still be applied before
+// the bg error is cleaned unless the memtable is full.
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:LoopOut",
+        "FLushWritNoWALRetryableeErrorAutoRecover1:1"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1");
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ("val1", Get(Key(1)));
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_GE(autoresume_retry.max, 0);
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  s = Flush();
+  // Since auto resume fails, the bg error is not cleaned, flush will
+  // return the bg_error set before.
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  // call auto resume
+  ASSERT_OK(dbfull()->Resume());
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  // After resume is successful, the flush should be ok.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val3", Get(Key(3)));
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_GE(autoresume_retry.max, 0);
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  s = Flush();
+  // Since auto resume is successful, the bg error is cleaned, flush will
+  // be successful.
+  ASSERT_OK(s);
+  ASSERT_EQ("val2", Get(Key(2)));
+  Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error. Activate the FS before the
+// first resume. Resume is successful
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+  ASSERT_EQ("val1", Get(Key(1)));
+  Reopen(options);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(Put(Key(2), "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error and set the retry limit
+// count. Never activate the FS, and auto resume should fail at the end.
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
+  // Fail all the resumes and let the user resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"FLushWritRetryableeErrorAutoRecover2:0",
+        "RecoverFromRetryableBGIOError:BeforeStart"},
+       {"RecoverFromRetryableBGIOError:LoopOut",
+        "FLushWritRetryableeErrorAutoRecover2:1"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0");
+  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1");
+  fault_fs_->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ("val1", Get(Key(1)));
+  // Auto resume fails because the FS does not recover during resume. The
+  // user calls resume manually here.
+  s = dbfull()->Resume();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(s);
+  ASSERT_OK(Put(Key(2), "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error and set the retry limit
+// count. Fail the first resume and let the second resume be successful.
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteRetryableErrorAutoRecover:0"}, + {"ManifestWriteRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteNoWALRetryableErrorAutoRecover:0"}, + {"ManifestWriteNoWALRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteNoWALRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), 
ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, + CompactionManifestWriteRetryableErrorAutoRecover) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + ASSERT_OK(Flush()); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteErrorAR:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + {"CompactionManifestWriteErrorAR:2", + "RecoverFromRetryableBGIOError:BeforeStart"}, + // Fail the first resume, before the wait in resume + {"RecoverFromRetryableBGIOError:BeforeResume0", + "CompactionManifestWriteErrorAR:3"}, + // Activate the FS before the second resume + {"CompactionManifestWriteErrorAR:4", + "RecoverFromRetryableBGIOError:BeforeResume1"}, + // Wait the auto resume be sucessful + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "CompactionManifestWriteErrorAR:5"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4"); + 
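+  // Per the dependency list above, :4 releases the second resume attempt now
+  // that the FS is active again, and :5 waits for the recovery to report
+  // success.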
TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { + // In this test, in the first round of compaction, the FS is set to error. + // So the first compaction fails due to retryable IO error and it is mapped + // to soft error. Then, compaction is rescheduled, in the second round of + // compaction, the FS is set to active and compaction is successful, so + // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync + // point. + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::atomic fail_first(false); + std::atomic fail_second(true); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + {"CompactionJob::FinishCompactionOutputFile1", + "CompactionWriteRetryableErrorAutoRecover0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void*) { fault_fs_->SetFilesystemActive(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", [&](void*) { + if (fail_first.load() && fail_second.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + fail_second.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_OK(s); + TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, 
&batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, + {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + TEST_SYNC_POINT("WALWriteErrorDone"); + + TEST_SYNC_POINT("WALWriteError1:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError1:1"); + TEST_SYNC_POINT("WALWriteError1:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { + // Fail the first recover and try second time. + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. 
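+  // (The Append callback installed below counts calls and deactivates the
+  // fault FS after the first two, so the WAL write fails partway through
+  // the batch.)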
+ { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"}, + {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + + TEST_SYNC_POINT("WALWriteError2:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError2:1"); + TEST_SYNC_POINT("WALWriteError2:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +// Fail auto resume from a flush retryable error and verify that +// OnErrorRecoveryEnd listener callback is called +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_EQ(listener->new_bg_error(), Status::Aborted()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FlushReadError) { + std::shared_ptr listener = + std::make_shared(); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeOutputValidation", [&](void*) { + IOStatus st = 
IOStatus::IOError(); + st.SetRetryable(true); + st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile); + fault_fs_->SetFilesystemActive(false, st); + }); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeDeleteFile", + [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_LE(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + Reopen(GetDefaultOptions()); + ASSERT_EQ("val", Get(Key(0))); +} + +TEST_F(DBErrorHandlingFSTest, AtomicFlushReadError) { + std::shared_ptr listener = + std::make_shared(); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + options.atomic_flush = true; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(0, Key(0), "val")); + ASSERT_OK(Put(1, Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeOutputValidation", [&](void*) { + IOStatus st = IOStatus::IOError(); + st.SetRetryable(true); + st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile); + fault_fs_->SetFilesystemActive(false, st); + }); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeDeleteFile", + [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush({0, 1}); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_LE(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + GetDefaultOptions()); + ASSERT_EQ("val", Get(Key(0))); +} + +TEST_F(DBErrorHandlingFSTest, AtomicFlushNoSpaceError) { + std::shared_ptr listener = + std::make_shared(); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(true); + options.atomic_flush = true; + CreateAndReopenWithCF({"pikachu"}, options); + + 
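+  // Put one key in each column family so that both memtables take part in
+  // the atomic flush below.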
ASSERT_OK(Put(0, Key(0), "val")); + ASSERT_OK(Put(1, Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("BuildTable:create_file", [&](void*) { + IOStatus st = IOStatus::NoSpace(); + fault_fs_->SetFilesystemActive(false, st); + }); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeDeleteFile", + [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush({0, 1}); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_LE(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_LE(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + GetDefaultOptions()); + ASSERT_EQ("val", Get(Key(0))); +} + +TEST_F(DBErrorHandlingFSTest, CompactionReadRetryableErrorAutoRecover) { + // In this test, in the first round of compaction, the FS is set to error. + // So the first compaction fails due to retryable IO error and it is mapped + // to soft error. Then, compaction is rescheduled, in the second round of + // compaction, the FS is set to active and compaction is successful, so + // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync + // point. + std::shared_ptr listener = + std::make_shared(); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Status s; + std::atomic fail_first(false); + std::atomic fail_second(true); + Random rnd(301); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); + } + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + {"CompactionJob::FinishCompactionOutputFile1", + "CompactionWriteRetryableErrorAutoRecover0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void*) { fault_fs_->SetFilesystemActive(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:2", [&](void*) { + if (fail_first.load() && fail_second.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + fail_second.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_OK(s); + TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + 
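+  // Reopen with the plain default options (no fault injection) to confirm
+  // the DB remains usable after the recovered compaction.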
Reopen(GetDefaultOptions()); +} + +class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, + public testing::WithParamInterface {}; + +TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Close(); +} + +TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->EnableAutoRecovery(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) 
{
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = GetParam();
+  Status s;
+  Random rnd(301);
+
+  listener->EnableAutoRecovery(true);
+  DestroyAndReopen(options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 199; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false,
+                                           IOStatus::IOFenced("IO fenced"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsIOFenced());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsIOFenced());
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest,
+                        ::testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as error handling is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..7987b8ec6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,371 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+Status EventListener::CreateFromString(
+    const ConfigOptions& config_options, const std::string& id,
+    std::shared_ptr<EventListener>* result) {
+  return LoadSharedObject<EventListener>(config_options, id, nullptr, result);
+}
+#endif  // ROCKSDB_LITE
+
+namespace {
+template <typename T>
+inline T SafeDivide(T a, T b) {
+  return b == 0 ?
0 : a / b; +} +} // anonymous namespace + +void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { + *jwriter << "time_micros" + << std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); +} + +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyTableFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, TableFileCreationReason reason) { + if (listeners.empty()) { + return; + } + TableFileCreationBriefInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.job_id = job_id; + info.reason = reason; + for (auto& listener : listeners) { + listener->OnTableFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::NotifyOnBackgroundError( + const std::vector>& listeners, + BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, + bool* auto_recovery) { +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnBackgroundError(reason, bg_error); + bg_error->PermitUncheckedError(); + if (*auto_recovery) { + listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); + } + } + db_mutex->Lock(); +#else + (void)listeners; + (void)reason; + (void)bg_error; + (void)db_mutex; + (void)auto_recovery; +#endif // ROCKSDB_LITE +} + +void EventHelpers::LogAndNotifyTableFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, const FileDescriptor& fd, + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "table_file_creation" + << "file_number" << fd.GetNumber() << "file_size" + << fd.GetFileSize() << "file_checksum" + << Slice(file_checksum).ToString(true) << "file_checksum_func_name" + << file_checksum_func_name << "smallest_seqno" << fd.smallest_seqno + << "largest_seqno" << fd.largest_seqno; + + // table_properties + { + jwriter << "table_properties"; + jwriter.StartObject(); + + // basic properties: + jwriter << "data_size" << table_properties.data_size << "index_size" + << table_properties.index_size << "index_partitions" + << table_properties.index_partitions << "top_level_index_size" + << table_properties.top_level_index_size + << "index_key_is_user_key" + << table_properties.index_key_is_user_key + << "index_value_is_delta_encoded" + << table_properties.index_value_is_delta_encoded << "filter_size" + << table_properties.filter_size << "raw_key_size" + << table_properties.raw_key_size << "raw_average_key_size" + << SafeDivide(table_properties.raw_key_size, + table_properties.num_entries) + << "raw_value_size" << table_properties.raw_value_size + << "raw_average_value_size" + << SafeDivide(table_properties.raw_value_size, + table_properties.num_entries) + << "num_data_blocks" << table_properties.num_data_blocks + << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries + << "num_deletions" << table_properties.num_deletions + << 
"num_merge_operands" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions + << "format_version" << table_properties.format_version + << "fixed_key_len" << table_properties.fixed_key_len + << "filter_policy" << table_properties.filter_policy_name + << "column_family_name" << table_properties.column_family_name + << "column_family_id" << table_properties.column_family_id + << "comparator" << table_properties.comparator_name + << "merge_operator" << table_properties.merge_operator_name + << "prefix_extractor_name" + << table_properties.prefix_extractor_name << "property_collectors" + << table_properties.property_collectors_names << "compression" + << table_properties.compression_name << "compression_options" + << table_properties.compression_options << "creation_time" + << table_properties.creation_time << "oldest_key_time" + << table_properties.oldest_key_time << "file_creation_time" + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" + << table_properties.db_session_id << "orig_file_number" + << table_properties.orig_file_number << "seqno_to_time_mapping"; + + if (table_properties.seqno_to_time_mapping.empty()) { + jwriter << "N/A"; + } else { + SeqnoToTimeMapping tmp; + Status status = tmp.Add(table_properties.seqno_to_time_mapping); + if (status.ok()) { + jwriter << tmp.ToHumanString(); + } else { + jwriter << "Invalid"; + } + } + + // user collected properties + for (const auto& prop : table_properties.readable_properties) { + jwriter << prop.first << prop.second; + } + jwriter.EndObject(); + } + + if (oldest_blob_file_number != kInvalidBlobFileNumber) { + jwriter << "oldest_blob_file_number" << oldest_blob_file_number; + } + + jwriter.EndObject(); + + event_logger->Log(jwriter); + } + +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + TableFileCreationInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.file_size = fd.file_size; + info.job_id = job_id; + info.table_properties = table_properties; + info.reason = reason; + info.status = s; + info.file_checksum = file_checksum; + info.file_checksum_func_name = file_checksum_func_name; + for (auto& listener : listeners) { + listener->OnTableFileCreated(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)db_name; + (void)cf_name; + (void)file_path; + (void)reason; +#endif // !ROCKSDB_LITE +} + +void EventHelpers::LogAndNotifyTableFileDeletion( + EventLogger* event_logger, int job_id, uint64_t file_number, + const std::string& file_path, const Status& status, + const std::string& dbname, + const std::vector>& listeners) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" + << "table_file_deletion" + << "file_number" << file_number; + if (!status.ok()) { + jwriter << "status" << status.ToString(); + } + + jwriter.EndObject(); + + event_logger->Log(jwriter); + +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + TableFileDeletionInfo info; + info.db_name = dbname; + info.job_id = job_id; + info.file_path = file_path; + info.status = status; + for (auto& listener : listeners) { + listener->OnTableFileDeleted(info); + } + info.status.PermitUncheckedError(); +#else + (void)file_path; + (void)dbname; + 
(void)listeners; +#endif // !ROCKSDB_LITE +} + +void EventHelpers::NotifyOnErrorRecoveryEnd( + const std::vector>& listeners, + const Status& old_bg_error, const Status& new_bg_error, + InstrumentedMutex* db_mutex) { +#ifndef ROCKSDB_LITE + if (!listeners.empty()) { + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + BackgroundErrorRecoveryInfo info; + info.old_bg_error = old_bg_error; + info.new_bg_error = new_bg_error; + listener->OnErrorRecoveryCompleted(old_bg_error); + listener->OnErrorRecoveryEnd(info); + info.old_bg_error.PermitUncheckedError(); + info.new_bg_error.PermitUncheckedError(); + } + db_mutex->Lock(); + } +#else + (void)listeners; + (void)old_bg_error; + (void)new_bg_error; + (void)db_mutex; +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyBlobFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, + BlobFileCreationReason creation_reason) { + if (listeners.empty()) { + return; + } + BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id, + creation_reason); + for (const auto& listener : listeners) { + listener->OnBlobFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::LogAndNotifyBlobFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, uint64_t file_number, + BlobFileCreationReason creation_reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name, uint64_t total_blob_count, + uint64_t total_blob_bytes) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "blob_file_creation" + << "file_number" << file_number << "total_blob_count" + << total_blob_count << "total_blob_bytes" << total_blob_bytes + << "file_checksum" << file_checksum << "file_checksum_func_name" + << file_checksum_func_name << "status" << s.ToString(); + + jwriter.EndObject(); + event_logger->Log(jwriter); + } + +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileCreationInfo info(db_name, cf_name, file_path, job_id, + creation_reason, total_blob_count, total_blob_bytes, + s, file_checksum, file_checksum_func_name); + for (const auto& listener : listeners) { + listener->OnBlobFileCreated(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)db_name; + (void)file_path; + (void)creation_reason; +#endif +} + +void EventHelpers::LogAndNotifyBlobFileDeletion( + EventLogger* event_logger, + const std::vector>& listeners, int job_id, + uint64_t file_number, const std::string& file_path, const Status& status, + const std::string& dbname) { + if (event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" + << "blob_file_deletion" + << "file_number" << file_number; + if (!status.ok()) { + jwriter << "status" << status.ToString(); + } + + jwriter.EndObject(); + event_logger->Log(jwriter); + } +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileDeletionInfo info(dbname, file_path, job_id, status); + for (const auto& listener : listeners) { + listener->OnBlobFileDeleted(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)dbname; + (void)file_path; 
+#endif  // !ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..68d819fe6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+  static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+  static void NotifyTableFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id,
+      TableFileCreationReason reason);
+#endif  // !ROCKSDB_LITE
+  static void NotifyOnBackgroundError(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      BackgroundErrorReason reason, Status* bg_error,
+      InstrumentedMutex* db_mutex, bool* auto_recovery);
+  static void LogAndNotifyTableFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, const FileDescriptor& fd,
+      uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+      TableFileCreationReason reason, const Status& s,
+      const std::string& file_checksum,
+      const std::string& file_checksum_func_name);
+  static void LogAndNotifyTableFileDeletion(
+      EventLogger* event_logger, int job_id, uint64_t file_number,
+      const std::string& file_path, const Status& status,
+      const std::string& db_name,
+      const std::vector<std::shared_ptr<EventListener>>& listeners);
+  static void NotifyOnErrorRecoveryEnd(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const Status& old_bg_error, const Status& new_bg_error,
+      InstrumentedMutex* db_mutex);
+
+#ifndef ROCKSDB_LITE
+  static void NotifyBlobFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id,
+      BlobFileCreationReason creation_reason);
+#endif  // !ROCKSDB_LITE
+
+  static void LogAndNotifyBlobFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, uint64_t file_number,
+      BlobFileCreationReason creation_reason, const Status& s,
+      const std::string& file_checksum,
+      const std::string& file_checksum_func_name, uint64_t total_blob_count,
+      uint64_t total_blob_bytes);
+
+  static void LogAndNotifyBlobFileDeletion(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+      uint64_t file_number, const std::string& file_path, const Status& status,
+      const std::string& db_name);
+
+ private:
+  static void LogAndNotifyTableFileCreation(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d838ebde5
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011-present, Facebook, Inc.
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/experimental.h" + +#include "db/db_impl/db_impl.h" +#include "db/version_util.h" +#include "logging/logging.h" + +namespace ROCKSDB_NAMESPACE { +namespace experimental { + +#ifndef ROCKSDB_LITE + +Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + if (db == nullptr) { + return Status::InvalidArgument("DB is empty"); + } + + return db->SuggestCompactRange(column_family, begin, end); +} + +Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { + if (db == nullptr) { + return Status::InvalidArgument("Didn't recognize DB object"); + } + return db->PromoteL0(column_family, target_level); +} + +#else // ROCKSDB_LITE + +Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) { + return Status::NotSupported("Not supported in RocksDB LITE"); +} + +Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { + return Status::NotSupported("Not supported in RocksDB LITE"); +} + +#endif // ROCKSDB_LITE + +Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) { + return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end); +} + +Status UpdateManifestForFilesState( + const DBOptions& db_opts, const std::string& db_name, + const std::vector& column_families, + const UpdateManifestForFilesStateOptions& opts) { + OfflineManifestWriter w(db_opts, db_name); + Status s = w.Recover(column_families); + + size_t files_updated = 0; + size_t cfs_updated = 0; + auto fs = db_opts.env->GetFileSystem(); + + for (auto cfd : *w.Versions().GetColumnFamilySet()) { + if (!s.ok()) { + break; + } + assert(cfd); + + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + + const auto* current = cfd->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + /* SST files */ + for (int level = 0; level < cfd->NumberLevels(); level++) { + if (!s.ok()) { + break; + } + const auto& level_files = vstorage->LevelFiles(level); + + for (const auto& lf : level_files) { + assert(lf); + + uint64_t number = lf->fd.GetNumber(); + std::string fname = + TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId()); + + std::unique_ptr f; + FileOptions fopts; + // Use kUnknown to signal the FileSystem to search all tiers for the + // file. 
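+          // (If the file is found, the temperature it reports is compared
+          // with the manifest below, and the manifest entry is rewritten on
+          // a mismatch.)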
+        fopts.temperature = Temperature::kUnknown;
+
+        IOStatus file_ios =
+            fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
+        if (file_ios.ok()) {
+          if (opts.update_temperatures) {
+            Temperature temp = f->GetTemperature();
+            if (temp != Temperature::kUnknown && temp != lf->temperature) {
+              // Current state inconsistent with manifest
+              ++files_updated;
+              edit.DeleteFile(level, number);
+              edit.AddFile(
+                  level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
+                  lf->smallest, lf->largest, lf->fd.smallest_seqno,
+                  lf->fd.largest_seqno, lf->marked_for_compaction, temp,
+                  lf->oldest_blob_file_number, lf->oldest_ancester_time,
+                  lf->file_creation_time, lf->file_checksum,
+                  lf->file_checksum_func_name, lf->unique_id);
+            }
+          }
+        } else {
+          s = file_ios;
+          break;
+        }
+      }
+    }
+
+    if (s.ok() && edit.NumEntries() > 0) {
+      std::unique_ptr<FSDirectory> db_dir;
+      s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
+      if (s.ok()) {
+        s = w.LogAndApply(cfd, &edit, db_dir.get());
+      }
+      if (s.ok()) {
+        ++cfs_updated;
+      }
+    }
+  }
+
+  if (cfs_updated > 0) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: updated %zu files in %zu CFs",
+                   files_updated, cfs_updated);
+  } else if (s.ok()) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: no updates needed");
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
+                    s.ToString().c_str());
+  }
+
+  return s;
+}
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
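The two experimental entry points added above are plain wrappers over DB methods, so a call site is short. A minimal sketch, assuming an already-open `DB*` (illustrative only, not part of the patch; `Example` and the key bounds are invented):

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/experimental.h"

// Hint that [begin, end) is worth compacting, then promote the
// (non-overlapping) L0 files of the default column family to level 2.
void Example(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Slice begin("key000");
  ROCKSDB_NAMESPACE::Slice end("key999");
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::experimental::SuggestCompactRange(db, &begin, &end);
  assert(s.ok());
  s = ROCKSDB_NAMESPACE::experimental::PromoteL0(
      db, db->DefaultColumnFamily(), /*target_level=*/2);
  assert(s.ok());  // fails if L0 files overlap each other or deeper levels
}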
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..665c89869
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1997 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileBasicTest()
+      : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) {
+    sst_files_dir_ = dbname_ + "_sst_files/";
+    fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_));
+    DestroyAndRecreateExternalSSTFilesDir();
+
+    // Check if the Env supports RandomRWFile
+    std::string file_path = sst_files_dir_ + "test_random_rw_file";
+    std::unique_ptr<WritableFile> wfile;
+    assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok());
+    wfile.reset();
+    std::unique_ptr<RandomRWFile> rwfile;
+    Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions());
+    if (s.IsNotSupported()) {
+      random_rwfile_supported_ = false;
+    } else {
+      EXPECT_OK(s);
+      random_rwfile_supported_ = true;
+    }
+    rwfile.reset();
+    EXPECT_OK(env_->DeleteFile(file_path));
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+    ASSERT_OK(env_->CreateDir(sst_files_dir_));
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+  Status AddFileWithFileChecksum(
+      const std::vector<std::string>& files,
+      const std::vector<std::string>& files_checksums,
+      const std::vector<std::string>& files_checksum_func_names,
+      bool verify_file_checksum = true, bool move_files = false,
+      bool skip_snapshot_check = false, bool write_global_seqno = true) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    opts.write_global_seqno = write_global_seqno;
+    opts.verify_file_checksum = verify_file_checksum;
+
+    IngestExternalFileArg arg;
+    arg.column_family = db_->DefaultColumnFamily();
+    arg.external_files = files;
+    arg.options = opts;
+    arg.files_checksums = files_checksums;
+    arg.files_checksum_func_names = files_checksum_func_names;
+    return db_->IngestExternalFiles({arg});
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types,
+      std::vector<std::pair<int, int>> range_deletions, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    assert(value_types.size() == 1 || keys.size() == value_types.size());
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operators, even though sst_file_writer.DeleteRange
+      // must be called before other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested sst.
+      std::string start_key = Key(range_deletions[i].first);
+      std::string end_key = Key(range_deletions[i].second);
+      s = sst_file_writer.DeleteRange(start_key, end_key);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+      auto start_key_it = true_data->find(start_key);
+      if (start_key_it == true_data->end()) {
+        start_key_it = true_data->upper_bound(start_key);
+      }
+      auto end_key_it = true_data->find(end_key);
+      if (end_key_it == true_data->end()) {
+        end_key_it = true_data->upper_bound(end_key);
+      }
+      true_data->erase(start_key_it, end_key_it);
+    }
+    for (size_t i = 0; i < keys.size(); i++) {
+      std::string key = Key(keys[i]);
+      std::string value = Key(keys[i]) + std::to_string(file_id);
+      ValueType value_type =
+          (value_types.size() == 1 ? value_types[0] : value_types[i]);
+      switch (value_type) {
+        case ValueType::kTypeValue:
+          s = sst_file_writer.Put(key, value);
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeMerge:
+          s = sst_file_writer.Merge(key, value);
+          // we only use TestPutOperator in this test
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeDeletion:
+          s = sst_file_writer.Delete(key);
+          true_data->erase(key);
+          break;
+        default:
+          return Status::InvalidArgument("Value type is not supported");
+      }
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = true;
+      ifo.write_global_seqno = write_global_seqno;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      s = db_->IngestExternalFile({file_path}, ifo);
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, value_types, {}, file_id, write_global_seqno,
+        verify_checksums_before_ingest, true_data);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, const ValueType value_type,
+      int file_id, bool write_global_seqno,
+      bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, std::vector<ValueType>(1, value_type), file_id,
+        write_global_seqno, verify_checksums_before_ingest, true_data);
+  }
+
+  ~ExternalSSTFileBasicTest() override {
+    DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+  bool random_rwfile_supported_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before
+  // opening a file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+
+  // Current file size should be non-zero after a successful write.
+  ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+  ASSERT_EQ(file1_info.num_range_del_entries, 0);
+  ASSERT_EQ(file1_info.smallest_range_del_key, "");
+  ASSERT_EQ(file1_info.largest_range_del_key, "");
+  ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum);
+  ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  // sst_file_writer already finished, cannot add this value
+  s = sst_file_writer.Put(Key(100), "bad_val");
+  ASSERT_NOK(s) << s.ToString();
+  s = sst_file_writer.DeleteRange(Key(100), Key(200));
+  ASSERT_NOK(s) << s.ToString();
+
+  DestroyAndReopen(options);
+  // Add file using file path
+  s = DeprecatedAddFile({file1});
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  for (int k = 0; k < 100; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+
+  DestroyAndRecreateExternalSSTFilesDir();
+}
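The tests that follow exercise full-file checksums. For orientation, a minimal sketch of how a DB is configured to generate them and how the values recorded in the MANIFEST can be read back (illustrative only, not part of the patch; `DumpLiveFileChecksums` is an invented helper):

#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/file_checksum.h"

// Open a DB with full-file crc32c checksums enabled (the same factory the
// tests below use) and fetch the checksum recorded for every live SST.
void DumpLiveFileChecksums(const std::string& db_path) {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  options.file_checksum_gen_factory =
      ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  assert(ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db).ok());

  std::unique_ptr<ROCKSDB_NAMESPACE::FileChecksumList> checksums(
      ROCKSDB_NAMESPACE::NewFileChecksumList());
  // Maps file number -> (checksum value, generator name) from the MANIFEST.
  assert(db->GetLiveFilesChecksumInfo(checksums.get()).ok());
  delete db;
}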
+class ChecksumVerifyHelper {
+ private:
+  Options options_;
+
+ public:
+  ChecksumVerifyHelper(Options& options) : options_(options) {}
+  ~ChecksumVerifyHelper() {}
+
+  Status GetSingleFileChecksumAndFuncName(
+      const std::string& file_path, std::string* file_checksum,
+      std::string* file_checksum_func_name) {
+    Status s;
+    EnvOptions soptions;
+    std::unique_ptr<SequentialFile> file_reader;
+    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    std::unique_ptr<char[]> scratch(new char[2048]);
+    Slice result;
+    FileChecksumGenFactory* file_checksum_gen_factory =
+        options_.file_checksum_gen_factory.get();
+    if (file_checksum_gen_factory == nullptr) {
+      *file_checksum = kUnknownFileChecksum;
+      *file_checksum_func_name = kUnknownFileChecksumFuncName;
+      return Status::OK();
+    } else {
+      FileChecksumGenContext gen_context;
+      std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+      *file_checksum_func_name = file_checksum_gen->Name();
+      s = file_reader->Read(2048, &result, scratch.get());
+      if (!s.ok()) {
+        return s;
+      }
+      while (result.size() != 0) {
+        file_checksum_gen->Update(scratch.get(), result.size());
+        s = file_reader->Read(2048, &result, scratch.get());
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      file_checksum_gen->Finalize();
+      *file_checksum = file_checksum_gen->GetChecksum();
+    }
+    return Status::OK();
+  }
+};
+
+TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
+  Options options = CurrentOptions();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  ChecksumVerifyHelper checksum_helper(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before
+  // opening a file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+  std::string file_checksum, file_checksum_func_name;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file1, &file_checksum, &file_checksum_func_name));
+
+  // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + ASSERT_EQ(file1_info.num_range_del_entries, 0); + ASSERT_EQ(file1_info.smallest_range_del_key, ""); + ASSERT_EQ(file1_info.largest_range_del_key, ""); + ASSERT_EQ(file1_info.file_checksum, file_checksum); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Put(Key(100), "bad_val"); + ASSERT_NOK(s) << s.ToString(); + s = sst_file_writer.DeleteRange(Key(100), Key(200)); + ASSERT_NOK(s) << s.ToString(); + + DestroyAndReopen(options); + // Add file using file path + s = DeprecatedAddFile({file1}); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { + Options old_options = CurrentOptions(); + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + const ImmutableCFOptions ioptions(options); + ChecksumVerifyHelper checksum_helper(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file01.sst (1000 => 1099) + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + std::string file_checksum1, file_checksum_func_name1; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file1, &file_checksum1, &file_checksum_func_name1)); + ASSERT_EQ(file1_info.file_checksum, file_checksum1); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1); + + // file02.sst (1100 => 1299) + std::string file2 = sst_files_dir_ + "file02.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 1100; k < 1300; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(1100)); + ASSERT_EQ(file2_info.largest_key, Key(1299)); + std::string file_checksum2, file_checksum_func_name2; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file2, &file_checksum2, &file_checksum_func_name2)); + ASSERT_EQ(file2_info.file_checksum, file_checksum2); + ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2); + + // file03.sst (1300 => 1499) + std::string file3 = sst_files_dir_ + "file03.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 1300; k < 1500; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 200); + ASSERT_EQ(file3_info.smallest_key, Key(1300)); + 
+  ASSERT_EQ(file3_info.largest_key, Key(1499));
+  std::string file_checksum3, file_checksum_func_name3;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file3, &file_checksum3, &file_checksum_func_name3));
+  ASSERT_EQ(file3_info.file_checksum, file_checksum3);
+  ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
+
+  // file04.sst (1500 => 1799)
+  std::string file4 = sst_files_dir_ + "file04.sst";
+  ASSERT_OK(sst_file_writer.Open(file4));
+  for (int k = 1500; k < 1800; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file4_info;
+  s = sst_file_writer.Finish(&file4_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file4_info.file_path, file4);
+  ASSERT_EQ(file4_info.num_entries, 300);
+  ASSERT_EQ(file4_info.smallest_key, Key(1500));
+  ASSERT_EQ(file4_info.largest_key, Key(1799));
+  std::string file_checksum4, file_checksum_func_name4;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file4, &file_checksum4, &file_checksum_func_name4));
+  ASSERT_EQ(file4_info.file_checksum, file_checksum4);
+  ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
+
+  // file05.sst (1800 => 1999)
+  std::string file5 = sst_files_dir_ + "file05.sst";
+  ASSERT_OK(sst_file_writer.Open(file5));
+  for (int k = 1800; k < 2000; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file5_info;
+  s = sst_file_writer.Finish(&file5_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file5_info.file_path, file5);
+  ASSERT_EQ(file5_info.num_entries, 200);
+  ASSERT_EQ(file5_info.smallest_key, Key(1800));
+  ASSERT_EQ(file5_info.largest_key, Key(1999));
+  std::string file_checksum5, file_checksum_func_name5;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file5, &file_checksum5, &file_checksum_func_name5));
+  ASSERT_EQ(file5_info.file_checksum, file_checksum5);
+  ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
+
+  // file06.sst (2000 => 2199)
+  std::string file6 = sst_files_dir_ + "file06.sst";
+  ASSERT_OK(sst_file_writer.Open(file6));
+  for (int k = 2000; k < 2200; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file6_info;
+  s = sst_file_writer.Finish(&file6_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file6_info.file_path, file6);
+  ASSERT_EQ(file6_info.num_entries, 200);
+  ASSERT_EQ(file6_info.smallest_key, Key(2000));
+  ASSERT_EQ(file6_info.largest_key, Key(2199));
+  std::string file_checksum6, file_checksum_func_name6;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file6, &file_checksum6, &file_checksum_func_name6));
+  ASSERT_EQ(file6_info.file_checksum, file_checksum6);
+  ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
+
+  s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"},
+                              {file_checksum1}, true, false, false, false);
+  // The checksum input is ignored since the db does not enable file checksums
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file1));
+  std::vector<LiveFileMetaData> live_files;
+  dbfull()->GetLiveFilesMetaData(&live_files);
+  std::set<std::string> set1;
+  for (auto f : live_files) {
+    set1.insert(f.name);
+    ASSERT_EQ(f.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  }
+
+  // check the temperature of the file being ingested
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature);
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+
+  // Reopen Db with checksum enabled
+  Reopen(options);
+  // Enable verify_file_checksum option
+  // The checksum vector does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"},
+                              {file_checksum_func_name2}, true, false, false,
+                              false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // The checksum name does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false,
+                              false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // The checksum itself does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2},
+                              true, false, false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // All matches, ingestion is successful
+  s = AddFileWithFileChecksum({file2}, {file_checksum2},
+                              {file_checksum_func_name2}, true, false, false,
+                              false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files1;
+  dbfull()->GetLiveFilesMetaData(&live_files1);
+  for (auto f : live_files1) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum2);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(env_->FileExists(file2));
+
+  // Enable verify_file_checksum option
+  // No checksum information is provided, generate it when ingesting
+  std::vector<std::string> checksum, checksum_func;
+  s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
+                              false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files2;
+  dbfull()->GetLiveFilesMetaData(&live_files2);
+  for (auto f : live_files2) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum3);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file3));
+
+  // Does not enable verify_file_checksum options
+  // The checksum name does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false,
+                              false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Does not enable verify_file_checksum options
+  // Checksum function name matches, store the checksum being ingested.
+  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+                              false, false, false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files3;
+  dbfull()->GetLiveFilesMetaData(&live_files3);
+  for (auto f : live_files3) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_FALSE(f.file_checksum == file_checksum4);
+      ASSERT_EQ(f.file_checksum, "asd");
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file4));
+
+  // Enable the verify_file_checksum option, with DB checksums enabled and
+  // write_global_seqno enabled. The stored checksum then differs from the
+  // one ingested, because writing the global sequence number changes the
+  // file contents.
+  s = AddFileWithFileChecksum({file5}, {file_checksum5},
+                              {file_checksum_func_name5}, true, false, false,
+                              true);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files4;
+  dbfull()->GetLiveFilesMetaData(&live_files4);
+  for (auto f : live_files4) {
+    if (set1.find(f.name) == set1.end()) {
+      std::string cur_checksum5, cur_checksum_func_name5;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
+      ASSERT_EQ(f.file_checksum, cur_checksum5);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file5));
+
+  // Does not enable verify_file_checksum options and also the ingested file
+  // checksum information is empty. DB will generate and store the checksum
+  // in Manifest.
+  std::vector<std::string> files_c6, files_name6;
+  s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
+                              false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files6;
+  dbfull()->GetLiveFilesMetaData(&live_files6);
+  for (auto f : live_files6) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum6);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file6));
+  db_->GetColumnFamilyMetaData(&metadata);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+}
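The `AddFileWithFileChecksum` helper above wraps the public `IngestExternalFiles` API. A minimal sketch of the same call outside the test harness (illustrative only; `IngestWithChecksum` is an invented name, the fields are the ones used by the helper):

#include <cassert>
#include <string>

#include "rocksdb/db.h"

// Ingest one external SST and have the db recompute and compare the
// caller-provided checksum before accepting the file.
void IngestWithChecksum(ROCKSDB_NAMESPACE::DB* db, const std::string& path,
                        const std::string& checksum,
                        const std::string& checksum_func_name) {
  ROCKSDB_NAMESPACE::IngestExternalFileArg arg;
  arg.column_family = db->DefaultColumnFamily();
  arg.external_files = {path};
  arg.files_checksums = {checksum};
  arg.files_checksum_func_names = {checksum_func_name};
  arg.options.verify_file_checksum = true;  // recompute and compare on ingest
  assert(db->IngestExternalFiles({arg}).ok());
}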
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+  Options options = CurrentOptions();
+  const ImmutableCFOptions ioptions(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_dir_ + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  s = sst_file_writer.Finish(&file2_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  // file3.sst (110 => 124) .. overlap with file2.sst
+  std::string file3 = sst_files_dir_ + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 110; k < 125; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file3_info;
+  s = sst_file_writer.Finish(&file3_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 15);
+  ASSERT_EQ(file3_info.smallest_key, Key(110));
+  ASSERT_EQ(file3_info.largest_key, Key(124));
+
+  s = DeprecatedAddFile({file1}, true /* move file */);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+  s = DeprecatedAddFile({file2}, false /* copy file */);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file2));
+
+  // This file has overlapping values with the existing data
+  s = DeprecatedAddFile({file3}, true /* move file */);
+  ASSERT_NOK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file3));
+
+  for (int k = 0; k < 300; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    options.merge_operator.reset(new TestPutOperator());
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // The range deletion ends on a key, but it doesn't actually delete
+    // this key because the largest key in the range is exclusive. Still,
+    // it counts as an overlap so a new seqno will be assigned.
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    options.merge_operator.reset(new TestPutOperator());
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+         ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+         ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6},
+        {ValueType::kTypeDeletion, ValueType::kTypeValue,
+         ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19},
+        {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+         ValueType::kTypeValue},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+        {{200, 250}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {300, 301, 302},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42},
+        {ValueType::kTypeValue, ValueType::kTypeDeletion,
+         ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40},
+        {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+         ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150},
+        {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+         ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
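The three seqno tests above all encode the same rule: an ingested file needs a global sequence number only when it overlaps existing data or a snapshot is live, and that number is physically written into the file only when `write_global_seqno` is set. A minimal sketch of the relevant options (illustrative only; `IngestPossiblyOverlapping` is an invented name):

#include <cassert>
#include <string>

#include "rocksdb/db.h"

// With allow_global_seqno enabled, ingestion succeeds even when the file
// overlaps live data; RocksDB then stamps the file with a fresh global
// sequence number, exactly as the assertions above count.
void IngestPossiblyOverlapping(ROCKSDB_NAMESPACE::DB* db,
                               const std::string& path) {
  ROCKSDB_NAMESPACE::IngestExternalFileOptions ifo;
  ifo.allow_global_seqno = true;   // otherwise an overlap fails the ingestion
  ifo.write_global_seqno = false;  // keep the file bytes unchanged; the seqno
                                   // is tracked in the MANIFEST instead
  assert(db->IngestExternalFile({path}, ifo).ok());
}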
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+  Options options = CurrentOptions();
+  const int kNumKeys = 10000;
+
+  size_t total_fadvised_bytes = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+        size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+        total_fadvised_bytes += fadvise_size;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<SstFileWriter> sst_file_writer;
+
+  std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, false));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise disabled
+  ASSERT_EQ(total_fadvised_bytes, 0);
+
+  sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, true));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise enabled
+  ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+  ASSERT_GT(total_fadvised_bytes, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = fault_injection_test_env_.get();
+
+  std::vector<std::pair<std::string, std::string>> test_cases = {
+      {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+       "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+      {"ExternalSstFileIngestionJob::BeforeSyncDir",
+       "ExternalSstFileIngestionJob::AfterSyncDir"},
+      {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+       "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+  for (size_t i = 0; i < test_cases.size(); i++) {
+    bool no_sync = false;
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(false);
+    });
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(true);
+    });
+    if (i == 0) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
+            Status* status = static_cast<Status*>(s);
+            if (status->IsNotSupported()) {
+              no_sync = true;
+            }
+          });
+    }
+    if (i == 2) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) {
+            Status* status = static_cast<Status*>(s);
+            if (status->IsNotSupported()) {
+              no_sync = true;
+            }
+          });
+    }
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndReopen(options);
+    if (i == 2) {
+      ASSERT_OK(Put("foo", "v1"));
+    }
+
+    Options sst_file_writer_options;
+    sst_file_writer_options.env = fault_injection_test_env_.get();
+    std::unique_ptr<SstFileWriter> sst_file_writer(
+        new SstFileWriter(EnvOptions(), sst_file_writer_options));
+    std::string file_name =
+        sst_files_dir_ + "sync_failure_test_" + std::to_string(i) + ".sst";
+    ASSERT_OK(sst_file_writer->Open(file_name));
+    ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+    ASSERT_OK(sst_file_writer->Finish());
+
+    IngestExternalFileOptions ingest_opt;
+    if (i == 0) {
+      ingest_opt.move_files = true;
+    }
+    const Snapshot* snapshot = db_->GetSnapshot();
+    if (i == 2) {
+      ingest_opt.write_global_seqno = true;
+    }
+    Status s = db_->IngestExternalFile({file_name}, ingest_opt);
+    if (no_sync) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_NOK(s);
+    }
+    db_->ReleaseSnapshot(snapshot);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    Destroy(options);
+  }
+}
+
+TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
+        Status* s = static_cast<Status*>(arg);
+        *s = Status::NotSupported();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  Options sst_file_writer_options;
+  sst_file_writer_options.env = env_;
+  std::unique_ptr<SstFileWriter> sst_file_writer(
+      new SstFileWriter(EnvOptions(), sst_file_writer_options));
+  std::string file_name =
+      sst_files_dir_ + "reopen_not_supported_test_" + ".sst";
+  ASSERT_OK(sst_file_writer->Open(file_name));
+  ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+  ASSERT_OK(sst_file_writer->Finish());
+
+  IngestExternalFileOptions ingest_opt;
+  ingest_opt.move_files = true;
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+  db_->ReleaseSnapshot(snapshot);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+  Options options;
+  options.create_if_missing = true;
+  SpecialEnv senv(env_);
+  options.env = &senv;
+  DestroyAndReopen(options);
+
+  Options sst_file_writer_options;
+  sst_file_writer_options.env = env_;
+  std::unique_ptr<SstFileWriter> sst_file_writer(
+      new SstFileWriter(EnvOptions(), sst_file_writer_options));
+  std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+  ASSERT_OK(sst_file_writer->Open(file_name));
+  Random rnd(301);
+  std::string value = rnd.RandomString(4000);
+  for (int i = 0; i < 5000; i++) {
+    ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+
+  // Ingest it once without verifying checksums to see the baseline
+  // preads.
+  IngestExternalFileOptions ingest_opt;
+  ingest_opt.move_files = false;
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  auto base_num_reads = senv.random_read_counter_.Read();
+  // Make sure the counter is enabled.
+  ASSERT_GT(base_num_reads, 0);
+
+  // Ingest again and observe the reads made for readahead.
+  ingest_opt.move_files = false;
+  ingest_opt.verify_checksums_before_ingest = true;
+  ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  // Make sure the counter is enabled.
+  ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+  // The SST file is about 20MB. Readahead size is 2MB.
+  // Give a conservative 15 reads for metadata blocks; the number
+  // of random reads should be within 20 MB / 2MB + 15 = 25.
+  ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+  Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
+  for (int i = 5; i < 25; i++) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       Key(i) + "_val"));
+  }
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file.sst (delete 0 => 30)
+  std::string file = sst_files_dir_ + "file.sst";
+  ASSERT_OK(sst_file_writer.Open(file));
+  ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30)));
+  ExternalSstFileInfo file_info;
+  ASSERT_OK(sst_file_writer.Finish(&file_info));
+  ASSERT_EQ(file_info.file_path, file);
+  ASSERT_EQ(file_info.num_entries, 0);
+  ASSERT_EQ(file_info.smallest_key, "");
+  ASSERT_EQ(file_info.largest_key, "");
+  ASSERT_EQ(file_info.num_range_del_entries, 1);
+  ASSERT_EQ(file_info.smallest_range_del_key, Key(0));
+  ASSERT_EQ(file_info.largest_range_del_key, Key(30));
+
+  IngestExternalFileOptions ifo;
+  ifo.move_files = true;
+  ifo.snapshot_consistency = true;
+  ifo.allow_global_seqno = true;
+  ifo.write_global_seqno = true;
+  ifo.verify_checksums_before_ingest = false;
+  ASSERT_OK(db_->IngestExternalFile({file}, ifo));
+
+  for (int i = 5; i < 25; i++) {
+    std::string res;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound());
+  }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+  int kNumLevels = 7;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  Reopen(options);
+
+  std::map<std::string, std::string> true_data;
+  int file_id = 1;
+  // prevent range deletions from being dropped due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+  for (int i = 0; i < 3; i++) {
+    if (i != 0) {
+      db_->Flush(FlushOptions());
+      if (i == 1) {
+        MoveFilesToLevel(kNumLevels - 1);
+      }
+    }
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(50 * i), Key(50 * (i + 1))));
+  }
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // overlaps with L0 file but not memtable, so flush is skipped and file is
+  // ingested into L0
+  SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+  // file is ingested into L5
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      file_id++, write_global_seqno, verify_checksums_before_ingest,
+      &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
// overlaps with L5 file but not memtable or L0 file, so flush is skipped and + // file is ingested into L4 + ASSERT_OK(GenerateAndAddExternalFile( + options, {}, {}, {{5, 15}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // ingested file overlaps with memtable, so flush is triggered before the file + // is ingested such that the ingested data is considered newest. So L0 file + // count increases by two. + ASSERT_OK(GenerateAndAddExternalFile( + options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(4, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // snapshot unneeded now that all range deletions are persisted + db_->ReleaseSnapshot(snapshot); + + // overlaps with nothing, so places at bottom level and skips incrementing + // seqnum. + ASSERT_OK(GenerateAndAddExternalFile( + options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue}, + {{160, 200}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); + ASSERT_EQ(4, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); +} + +TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { + Options options = CurrentOptions(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file8.sst (delete 300 => 400) + std::string file8 = sst_files_dir_ + "file8.sst"; + ASSERT_OK(sst_file_writer.Open(file8)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); + ExternalSstFileInfo file8_info; + Status s = sst_file_writer.Finish(&file8_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file8_info.file_path, file8); + ASSERT_EQ(file8_info.num_entries, 0); + ASSERT_EQ(file8_info.smallest_key, ""); + ASSERT_EQ(file8_info.largest_key, ""); + ASSERT_EQ(file8_info.num_range_del_entries, 1); + ASSERT_EQ(file8_info.smallest_range_del_key, Key(300)); + ASSERT_EQ(file8_info.largest_range_del_key, Key(400)); + + // file9.sst (delete 400 => 500) + std::string file9 = sst_files_dir_ + "file9.sst"; + ASSERT_OK(sst_file_writer.Open(file9)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); + ExternalSstFileInfo file9_info; + s = sst_file_writer.Finish(&file9_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file9_info.file_path, file9); + ASSERT_EQ(file9_info.num_entries, 0); + ASSERT_EQ(file9_info.smallest_key, ""); + ASSERT_EQ(file9_info.largest_key, ""); + ASSERT_EQ(file9_info.num_range_del_entries, 1); + ASSERT_EQ(file9_info.smallest_range_del_key, Key(400)); + ASSERT_EQ(file9_info.largest_range_del_key, Key(500)); + + // Range deletion tombstones are exclusive on their end key, so these SSTs + // should not be considered as overlapping. 
+ s = DeprecatedAddFile({file8, file9});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+ bool change_checksum_called = false;
+ const auto& change_checksum = [&](void* arg) {
+ if (!change_checksum_called) {
+ char* buf = reinterpret_cast<char*>(arg);
+ assert(nullptr != buf);
+ buf[0] ^= 0x1;
+ change_checksum_called = true;
+ }
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum",
+ change_checksum);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+ Status s = GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data);
+ if (verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ change_checksum_called = false;
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ int file_id = 0;
+ EnvOptions env_options;
+ do {
+ Options options = CurrentOptions();
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ SstFileWriter sst_file_writer(env_options, options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ {
+ // Get file size
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+ ASSERT_GT(file_size, 8);
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ // We deterministically corrupt the first byte because we currently
+ // cannot choose a random offset. The reason for this limitation is that
+ // we do not checksum the properties block at present.
+ const uint64_t offset = 0;
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip the bits of the first byte
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+ // Ingest file.
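+ // Whether the flipped byte is noticed at this point depends entirely on
+ // verify_checksums_before_ingest: when set, the pre-ingestion checksum
+ // pass below must fail; when unset, the corruption is accepted silently.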
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ s = db_->IngestExternalFile({file_path}, ifo);
+ if (ifo.verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ if (!verify_checksums_before_ingest) {
+ ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest");
+ return;
+ }
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ uint64_t props_block_offset = 0;
+ size_t props_block_size = 0;
+ const auto& get_props_block_offset = [&](void* arg) {
+ props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+ };
+ const auto& get_props_block_size = [&](void* arg) {
+ props_block_size = *reinterpret_cast<uint64_t*>(arg);
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ get_props_block_offset);
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ get_props_block_size);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ Random64 rand(time(nullptr));
+ do {
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ {
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ ASSERT_GT(props_block_size, 8);
+ uint64_t offset =
+ props_block_offset + rand.Next() % (props_block_size - 8);
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip the bits of one byte
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+
+ // Ingest file.
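+ // Unlike the block-corruption tests above, the damage here sits inside the
+ // properties block, so this test runs only in the
+ // verify_checksums_before_ingest == true configuration (see the bypass at
+ // the top) and always expects the ingestion below to fail.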
+ IngestExternalFileOptions ifo; + ifo.write_global_seqno = std::get<0>(GetParam()); + ifo.verify_checksums_before_ingest = true; + s = db_->IngestExternalFile({file_path}, ifo); + ASSERT_NOK(s); + } while (ChangeOptionsForFileIngestionTest()); +} + +TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { + Options options = CurrentOptions(); + + std::vector files; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "z")); + ASSERT_OK(sst_file_writer.Put("i", "m")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + } + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("i", "k")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "z"); + ASSERT_EQ(Get("i"), "k"); + + int total_keys = 0; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + total_keys++; + } + delete iter; + ASSERT_EQ(total_keys, 2); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); +} + +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. 
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + +TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { + Options options = CurrentOptions(); + const ImmutableCFOptions ioptions(options); + options.bottommost_temperature = Temperature::kWarm; + SstFileWriter sst_file_writer(EnvOptions(), options); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + + // create file01.sst (1000 => 1099) and ingest it + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + + std::vector files; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kWarm; + + files.push_back(file1); + IngestExternalFileOptions in_opts; + in_opts.move_files = false; + in_opts.snapshot_consistency = true; + in_opts.allow_global_seqno = false; + in_opts.allow_blocking_flush = false; + in_opts.write_global_seqno = true; + in_opts.verify_file_checksum = false; + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = in_opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + arg.file_temperature = file_temperature; + s = db_->IngestExternalFiles({arg}); + ASSERT_OK(s); + + // check the temperature of the file being ingested + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 1); + + // non-bottommost file still has unknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + 
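+ // Probing the per-temperature live-SST-size property with an arbitrary
+ // enum value (22) that corresponds to no defined temperature; it should
+ // report a size of zero.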
ASSERT_TRUE(dbfull()->GetProperty( + DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +} + +TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) { + Options options = GetDefaultOptions(); + + std::string file_path = sst_files_dir_ + std::to_string(1); + SstFileWriter sfw(EnvOptions(), options); + + ASSERT_OK(sfw.Open(file_path)); + ASSERT_OK(sfw.Put("b", "dontcare")); + ASSERT_OK(sfw.Finish()); + + // Test universal compaction + ingest with snapshot consistency + options.create_if_missing = true; + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); + { + const Snapshot* snapshot = db_->GetSnapshot(); + ManagedSnapshot snapshot_guard(db_, snapshot); + IngestExternalFileOptions ifo; + ifo.fail_if_not_bottommost_level = true; + ifo.snapshot_consistency = true; + const Status s = db_->IngestExternalFile({file_path}, ifo); + ASSERT_TRUE(s.IsTryAgain()); + } + + // Test level compaction + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.num_levels = 2; + DestroyAndReopen(options); + ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare")); + ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare")); + ASSERT_OK(db_->Flush(FlushOptions())); + + ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare")); + ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare")); + ASSERT_OK(db_->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + IngestExternalFileOptions ifo; + ifo.fail_if_not_bottommost_level = true; + const Status s = db_->IngestExternalFile({file_path}, ifo); + ASSERT_TRUE(s.IsTryAgain()); + } +} + +TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) { + const std::string kPutVal = "put_val"; + const std::string kIngestedVal = "ingested_val"; + + ASSERT_OK(Put("k", kPutVal, WriteOptions())); + ASSERT_OK(Flush()); + + std::string external_file = sst_files_dir_ + "/file_to_ingest.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + + ASSERT_OK(db_->VerifyChecksum()); +} + +TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) { + const std::string kPutVal = "put_val"; + const std::string kIngestedVal = "ingested_val"; + + ASSERT_OK(Put("k", kPutVal, WriteOptions())); + ASSERT_OK(Flush()); + + std::string external_file = sst_files_dir_ + "/file_to_ingest.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + + // Test ingest file without session_id and db_id (for example generated by an + // older version of sst_writer) + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) { + auto props = static_cast(props_vs); + // update table property session_id to a different one + props->db_session_id = ""; + props->db_id = ""; + }); + std::atomic_int skipped = 0, passed = 0; + SyncPoint::GetInstance()->SetCallBack( + 
"BlockBasedTable::Open::SkippedVerifyUniqueId", + [&](void* /*arg*/) { skipped++; }); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::PassedVerifyUniqueId", + [&](void* /*arg*/) { passed++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto options = CurrentOptions(); + ASSERT_TRUE(options.verify_sst_unique_id_in_manifest); + Reopen(options); + ASSERT_EQ(skipped, 0); + ASSERT_EQ(passed, 2); // one flushed + one ingested + + external_file = sst_files_dir_ + "/file_to_ingest2.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + + // Two table file opens skipping verification: + // * ExternalSstFileIngestionJob::GetIngestedFileInfo + // * TableCache::GetTableReader + ASSERT_EQ(skipped, 2); + ASSERT_EQ(passed, 2); + + // Check same after re-open (except no GetIngestedFileInfo) + skipped = 0; + passed = 0; + Reopen(options); + ASSERT_EQ(skipped, 1); + ASSERT_EQ(passed, 2); +} + +TEST_F(ExternalSSTFileBasicTest, StableSnapshotWhileLoggingToManifest) { + const std::string kPutVal = "put_val"; + const std::string kIngestedVal = "ingested_val"; + + ASSERT_OK(Put("k", kPutVal, WriteOptions())); + ASSERT_OK(Flush()); + + std::string external_file = sst_files_dir_ + "/file_to_ingest.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + const Snapshot* snapshot = nullptr; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void* /* arg */) { + // prevent background compaction job to call this callback + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + snapshot = db_->GetSnapshot(); + ReadOptions read_opts; + read_opts.snapshot = snapshot; + std::string value; + ASSERT_OK(db_->Get(read_opts, "k", &value)); + ASSERT_EQ(kPutVal, value); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + auto ingested_file_seqno = db_->GetLatestSequenceNumber(); + ASSERT_NE(nullptr, snapshot); + // snapshot is taken before SST ingestion is done + ASSERT_EQ(ingested_file_seqno, snapshot->GetSequenceNumber() + 1); + + ReadOptions read_opts; + read_opts.snapshot = snapshot; + std::string value; + ASSERT_OK(db_->Get(read_opts, "k", &value)); + ASSERT_EQ(kPutVal, value); + db_->ReleaseSnapshot(snapshot); + + // After reopen, sequence number should be up current such that + // ingested value is read + Reopen(CurrentOptions()); + ASSERT_OK(db_->Get(ReadOptions(), "k", &value)); + ASSERT_EQ(kIngestedVal, value); + + // New write should get higher seqno compared to ingested file + ASSERT_OK(Put("k", kPutVal, WriteOptions())); + ASSERT_EQ(db_->GetLatestSequenceNumber(), ingested_file_seqno + 1); +} + +INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, + testing::Values(std::make_tuple(true, true), + std::make_tuple(true, false), + std::make_tuple(false, true), + std::make_tuple(false, false))); + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + 
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..ba1277eab
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,1020 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+ const std::vector<std::string>& external_files_paths,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ const Temperature& file_temperature, uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are ingesting
+ for (const std::string& file_path : external_files_paths) {
+ IngestedFileInfo file_to_ingest;
+ status =
+ GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (file_to_ingest.cf_id !=
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+ file_to_ingest.cf_id != cfd_->GetID()) {
+ return Status::InvalidArgument(
+ "External file column family id doesn't match");
+ }
+
+ if (file_to_ingest.num_entries == 0 &&
+ file_to_ingest.num_range_deletions == 0) {
+ return Status::InvalidArgument("File contains no entries");
+ }
+
+ if (!file_to_ingest.smallest_internal_key.Valid() ||
+ !file_to_ingest.largest_internal_key.Valid()) {
+ return Status::Corruption("Generated table has corrupted keys");
+ }
+
+ files_to_ingest_.emplace_back(std::move(file_to_ingest));
+ }
+
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_ingest_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that passed files don't have overlapping ranges
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ sorted_files.push_back(&files_to_ingest_[i]);
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i + 1 < num_files; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ files_overlap_ = true;
+ break;
+ }
+ }
+ }
+
+ // Handle the file temperature
+ for (size_t i = 0; i < num_files; i++) {
+ files_to_ingest_[i].file_temperature = file_temperature;
+ }
+
+ if (ingestion_options_.ingest_behind && files_overlap_) {
+ return Status::NotSupported("Files have overlapping ranges");
+ }
+
+ // Copy/Move external files into DB
+ std::unordered_set<size_t> ingestion_path_ids;
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ f.copy_file = false;
+ const std::string path_outside_db = f.external_file_path;
+ const std::string path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+ if (ingestion_options_.move_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.ok()) {
+ // It is unsafe to assume the application has synced the file and the
+ // file directory before ingesting it. For the integrity of RocksDB we
+ // need to sync the file.
+ std::unique_ptr<FSWritableFile> file_to_sync;
+ Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+ &file_to_sync, nullptr);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+ &s);
+ // Some file systems (especially remote/distributed) don't support
+ // reopening a file for writing and don't require reopening and
+ // syncing the file. Ignore the NotSupported error in that case.
+ if (!s.IsNotSupported()) {
+ status = s;
+ if (status.ok()) {
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+ status = SyncIngestedFile(file_to_sync.get());
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s: %s",
+ path_inside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ }
+ } else if (status.IsNotSupported() &&
+ ingestion_options_.failed_move_fall_back_to_copy) {
+ // Original file is on a different FS, use copy instead of hard linking.
+ f.copy_file = true;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Tried to link file %s but it's not supported: %s",
+ path_outside_db.c_str(), status.ToString().c_str());
+ }
+ } else {
+ f.copy_file = true;
+ }
+
+ if (f.copy_file) {
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+ nullptr);
+ // CopyFile also syncs the new file.
+ status =
+ CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+ if (!status.ok()) {
+ break;
+ }
+ f.internal_file_path = path_inside_db;
+ // Initialize the checksum information of ingested files.
+ f.file_checksum = kUnknownFileChecksum;
+ f.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ ingestion_path_ids.insert(f.fd.GetPathId());
+ }
+
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+ if (status.ok()) {
+ for (auto path_id : ingestion_path_ids) {
+ status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync directory %" ROCKSDB_PRIszt
+ " while ingesting file: %s",
+ path_id, status.ToString().c_str());
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+ // Generate and check the sst file checksum. Note that, if
+ // IngestExternalFileOptions::write_global_seqno is true, we will not update
+ // the checksum information in files_to_ingest_ here, since the file is
+ // updated with the new global_seqno. After global_seqno is updated, DB will
+ // generate the new checksum and store it in the Manifest.
In all other cases + // if ingestion_options_.write_global_seqno == true and + // verify_file_checksum is false, we only check the checksum function name. + if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) { + if (ingestion_options_.verify_file_checksum == false && + files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Only when verify_file_checksum == false and the checksum for ingested + // files are provided, DB will use the provided checksum and does not + // generate the checksum for ingested files. + need_generate_file_checksum_ = false; + } else { + need_generate_file_checksum_ = true; + } + FileChecksumGenContext gen_context; + std::unique_ptr file_checksum_gen = + db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::vector generated_checksums; + std::vector generated_checksum_func_names; + // Step 1: generate the checksum for ingested sst file. + if (need_generate_file_checksum_) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + std::string generated_checksum; + std::string generated_checksum_func_name; + std::string requested_checksum_func_name; + // TODO: rate limit file reads for checksum calculation during file + // ingestion. + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), files_to_ingest_[i].internal_file_path, + db_options_.file_checksum_gen_factory.get(), + requested_checksum_func_name, &generated_checksum, + &generated_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get(), + Env::IO_TOTAL /* rate_limiter_priority */); + if (!io_s.ok()) { + status = io_s; + ROCKS_LOG_WARN(db_options_.info_log, + "Sst file checksum generation of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + if (ingestion_options_.write_global_seqno == false) { + files_to_ingest_[i].file_checksum = generated_checksum; + files_to_ingest_[i].file_checksum_func_name = + generated_checksum_func_name; + } + generated_checksums.push_back(generated_checksum); + generated_checksum_func_names.push_back(generated_checksum_func_name); + } + } + + // Step 2: based on the verify_file_checksum and ingested checksum + // information, do the verification. + if (status.ok()) { + if (files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Verify the checksum and checksum function name. + if (ingestion_options_.verify_file_checksum) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != + generated_checksum_func_names[i]) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + if (files_checksums[i] != generated_checksums[i]) { + status = Status::Corruption( + "Ingested checksum does not match with the generated " + "checksum"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + } + } else { + // If verify_file_checksum is not enabled, we only verify the + // checksum function name. 
If it does not match, fail the ingestion. + // If matches, we trust the ingested checksum information and store + // in the Manifest. + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != file_checksum_gen->Name()) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + files_to_ingest_[i].file_checksum = files_checksums[i]; + files_to_ingest_[i].file_checksum_func_name = + files_checksum_func_names[i]; + } + } + } else if (files_checksums.size() != files_checksum_func_names.size() || + (files_checksums.size() == files_checksum_func_names.size() && + files_checksums.size() != 0)) { + // The checksum or checksum function name vector are not both empty + // and they are incomplete. + status = Status::InvalidArgument( + "The checksum information of ingested sst files are nonempty and " + "the size of checksums or the size of the checksum function " + "names " + "does not match with the number of ingested sst files"); + ROCKS_LOG_WARN( + db_options_.info_log, + "The ingested sst files checksum information is incomplete: %s", + status.ToString().c_str()); + } + } + } + + // TODO: The following is duplicated with Cleanup(). + if (!status.ok()) { + IOOptions io_opts; + // We failed, remove all files that we copied into the db + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } + + return status; +} + +Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, + SuperVersion* super_version) { + autovector ranges; + autovector keys; + size_t ts_sz = cfd_->user_comparator()->timestamp_size(); + if (ts_sz) { + // Check all ranges [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to + // include all `end` keys. 
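+ // A small sketch of the bounds built below, assuming an 8-byte timestamp:
+ // for a file whose user keys span ["a", "z"], the probed range becomes
+ //   begin = "a" + max timestamp (so every version of "a" is included)
+ //   end   = "z" + min timestamp (so every version of "z" is included)
+ // which guarantees no timestamped version of a boundary key is missed.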
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + std::string begin_str; + std::string end_str; + AppendUserKeyWithMaxTimestamp( + &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz); + AppendKeyWithMinTimestamp( + &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz); + keys.emplace_back(std::move(begin_str)); + keys.emplace_back(std::move(end_str)); + } + for (size_t i = 0; i < files_to_ingest_.size(); ++i) { + ranges.emplace_back(keys[2 * i], keys[2 * i + 1]); + } + } else { + for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), + file_to_ingest.largest_internal_key.user_key()); + } + } + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); + if (status.ok() && *flush_needed && + !ingestion_options_.allow_blocking_flush) { + status = Status::InvalidArgument("External file requires flush"); + } + return status; +} + +// REQUIRES: we have become the only writer by entering both write_thread_ and +// nonmem_write_thread_ +Status ExternalSstFileIngestionJob::Run() { + Status status; + SuperVersion* super_version = cfd_->GetSuperVersion(); +#ifndef NDEBUG + // We should never run the job with a memtable that is overlapping + // with the files we are ingesting + bool need_flush = false; + status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } + assert(status.ok() && need_flush == false); +#endif + + bool force_global_seqno = false; + + if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) { + // We need to assign a global sequence number to all the files even + // if the don't overlap with any ranges since we have snapshots + force_global_seqno = true; + } + // It is safe to use this instead of LastAllocatedSequence since we are + // the only active writer, and hence they are equal + SequenceNumber last_seqno = versions_->LastSequence(); + edit_.SetColumnFamily(cfd_->GetID()); + // The levels that the files will be ingested into + + for (IngestedFileInfo& f : files_to_ingest_) { + SequenceNumber assigned_seqno = 0; + if (ingestion_options_.ingest_behind) { + status = CheckLevelForIngestedBehindFile(&f); + } else { + status = AssignLevelAndSeqnoForIngestedFile( + super_version, force_global_seqno, cfd_->ioptions()->compaction_style, + last_seqno, &f, &assigned_seqno); + } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. 
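+ // Put differently: a bound read from the file carries seqno 0 for a plain
+ // key, or a nonzero seqno marking a range tombstone's exclusive endpoint;
+ // only the seqno-0 bounds are rewritten with the assigned seqno below.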
+ ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } + if (!status.ok()) { + return status; + } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", + &assigned_seqno); + if (assigned_seqno > last_seqno) { + assert(assigned_seqno == last_seqno + 1); + last_seqno = assigned_seqno; + ++consumed_seqno_count_; + } + if (!status.ok()) { + return status; + } + + status = GenerateChecksumForIngestedFile(&f); + if (!status.ok()) { + return status; + } + + // We use the import time as the ancester time. This is the time the data + // is written to the database. + int64_t temp_current_time = 0; + uint64_t current_time = kUnknownFileCreationTime; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + if (clock_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast(temp_current_time); + } + FileMetaData f_metadata( + f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), + f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, + f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, f.file_checksum, + f.file_checksum_func_name, f.unique_id); + f_metadata.temperature = f.file_temperature; + edit_.AddFile(f.picked_level, f_metadata); + } + return status; +} + +void ExternalSstFileIngestionJob::UpdateStats() { + // Update internal stats for new ingested files + uint64_t total_keys = 0; + uint64_t total_l0_files = 0; + uint64_t total_time = clock_->NowMicros() - job_start_time_; + + EventLoggerStream stream = event_logger_->Log(); + stream << "event" + << "ingest_finished"; + stream << "files_ingested"; + stream.StartArray(); + + for (IngestedFileInfo& f : files_to_ingest_) { + InternalStats::CompactionStats stats( + CompactionReason::kExternalSstIngestion, 1); + stats.micros = total_time; + // If actual copy occurred for this file, then we need to count the file + // size as the actual bytes written. If the file was linked, then we ignore + // the bytes written for file metadata. + // TODO (yanqin) maybe account for file metadata bytes for exact accuracy? 
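+ // Copied files are charged to bytes_written (real data I/O occurred);
+ // hard-linked files are charged to bytes_moved (only a new directory entry
+ // was created).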
+ if (f.copy_file) { + stats.bytes_written = f.fd.GetFileSize(); + } else { + stats.bytes_moved = f.fd.GetFileSize(); + } + stats.num_output_files = 1; + cfd_->internal_stats()->AddCompactionStats(f.picked_level, + Env::Priority::USER, stats); + cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE, + f.fd.GetFileSize()); + total_keys += f.num_entries; + if (f.picked_level == 0) { + total_l0_files += 1; + } + ROCKS_LOG_INFO( + db_options_.info_log, + "[AddFile] External SST file %s was ingested in L%d with path %s " + "(global_seqno=%" PRIu64 ")\n", + f.external_file_path.c_str(), f.picked_level, + f.internal_file_path.c_str(), f.assigned_seqno); + stream << "file" << f.internal_file_path << "level" << f.picked_level; + } + stream.EndArray(); + + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL, + total_keys); + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL, + files_to_ingest_.size()); + cfd_->internal_stats()->AddCFStats( + InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files); +} + +void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; + if (!status.ok()) { + // We failed to add the files to the database + // remove all the files we copied + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + consumed_seqno_count_ = 0; + files_overlap_ = false; + } else if (status.ok() && ingestion_options_.move_files) { + // The files were moved and added successfully, remove original file links + for (IngestedFileInfo& f : files_to_ingest_) { + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ExternalSstFileIngestionJob::GetIngestedFileInfo( + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, SuperVersion* sv) { + file_to_ingest->external_file_path = external_file; + + // Get external file size + Status status = fs_->GetFileSize(external_file, IOOptions(), + &file_to_ingest->file_size, nullptr); + if (!status.ok()) { + return status; + } + + // Assign FD with number + file_to_ingest->fd = + FileDescriptor(new_file_number, 0, file_to_ingest->file_size); + + // Create TableReader for external file + std::unique_ptr table_reader; + std::unique_ptr sst_file; + std::unique_ptr sst_file_reader; + + status = + fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); + if (!status.ok()) { + return status; + } + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + /*skip_filters*/ false, /*immortal*/ false, + 
/*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), + std::move(sst_file_reader), file_to_ingest->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + if (ingestion_options_.verify_checksums_before_ingest) { + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + ReadOptions ro; + ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + status = table_reader->VerifyChecksum( + ro, TableReaderCaller::kExternalSSTIngestion); + } + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + const auto& uprops = props->user_collected_properties; + + // Get table version + auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion); + if (version_iter == uprops.end()) { + return Status::Corruption("External file version not found"); + } + file_to_ingest->version = DecodeFixed32(version_iter->second.c_str()); + + auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno); + if (file_to_ingest->version == 2) { + // version 2 imply that we have global sequence number + if (seqno_iter == uprops.end()) { + return Status::Corruption( + "External file global sequence number not found"); + } + + // Set the global sequence number + file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str()); + if (props->external_sst_file_global_seqno_offset == 0) { + file_to_ingest->global_seqno_offset = 0; + return Status::Corruption("Was not able to find file global seqno field"); + } + file_to_ingest->global_seqno_offset = + static_cast(props->external_sst_file_global_seqno_offset); + } else if (file_to_ingest->version == 1) { + // SST file V1 should not have global seqno field + assert(seqno_iter == uprops.end()); + file_to_ingest->original_seqno = 0; + if (ingestion_options_.allow_blocking_flush || + ingestion_options_.allow_global_seqno) { + return Status::InvalidArgument( + "External SST file V1 does not support global seqno"); + } + } else { + return Status::InvalidArgument("External file version is not supported"); + } + // Get number of entries in table + file_to_ingest->num_entries = props->num_entries; + file_to_ingest->num_range_deletions = props->num_range_deletions; + + ParsedInternalKey key; + ReadOptions ro; + // During reading the external file we can cache blocks that we read into + // the block cache, if we later change the global seqno of this file, we will + // have block in cache that will include keys with wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache. + ro.fill_cache = false; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + std::unique_ptr range_del_iter( + table_reader->NewRangeTombstoneIterator(ro)); + + // Get first (smallest) and last (largest) key from file. 
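+ // Both bounds must carry sequence number zero: SstFileWriter emits keys
+ // without sequence numbers, and a nonzero value is rejected as corruption
+ // below.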
+ file_to_ingest->smallest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + file_to_ingest->largest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; + iter->SeekToFirst(); + if (iter->Valid()) { + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->smallest_internal_key.SetFrom(key); + + iter->SeekToLast(); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->largest_internal_key.SetFrom(key); + + bounds_set = true; + } + + // We may need to adjust these key bounds, depending on whether any range + // deletion tombstones extend past them. + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + if (range_del_iter != nullptr) { + for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); + range_del_iter->Next()) { + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone tombstone(key, range_del_iter->value()); + + InternalKey start_key = tombstone.SerializeKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, start_key, + file_to_ingest->smallest_internal_key) < 0) { + file_to_ingest->smallest_internal_key = start_key; + } + InternalKey end_key = tombstone.SerializeEndKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, end_key, + file_to_ingest->largest_internal_key) > 0) { + file_to_ingest->largest_internal_key = end_key; + } + bounds_set = true; + } + } + + file_to_ingest->cf_id = static_cast(props->column_family_id); + + file_to_ingest->table_properties = *props; + + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_ingest->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_ingest->internal_file_path.c_str()); + file_to_ingest->unique_id = kNullUniqueId64x2; + } + + return status; +} + +Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( + SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno) { + Status status; + *assigned_seqno = 0; + if (force_global_seqno) { + *assigned_seqno = last_seqno + 1; + if (compaction_style == kCompactionStyleUniversal || files_overlap_) { + if (ingestion_options_.fail_if_not_bottommost_level) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of " + "Lmax does not overlap with files to ingest."); + return status; + } + file_to_ingest->picked_level = 0; + return status; + } + } + + bool overlap_with_db = false; + Arena arena; + ReadOptions ro; + ro.total_order_seek = true; + int target_level = 0; + auto* vstorage = cfd_->current()->storage_info(); + + for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + if (lvl > 0 && lvl < vstorage->base_level()) { + continue; + } + + if (vstorage->NumLevelFiles(lvl) > 0) { + bool overlap_with_level = false; + status = sv->current->OverlapWithLevelIterator( + ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl, + &overlap_with_level); + if (!status.ok()) { + return status; + } + if (overlap_with_level) { + // We must use L0 or any level higher than `lvl` to be able to overwrite + // the keys that we overlap with in this level, We also need to assign + // this file a seqno to overwrite the existing keys in level `lvl` + overlap_with_db = true; + break; + } + + if (compaction_style == kCompactionStyleUniversal && lvl != 0) { + const std::vector& level_files = + vstorage->LevelFiles(lvl); + const SequenceNumber level_largest_seqno = + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) + ->fd.largest_seqno; + // should only assign seqno to current level's largest seqno when + // the file fits + if (level_largest_seqno != 0 && + IngestedFileFitInLevel(file_to_ingest, lvl)) { + *assigned_seqno = level_largest_seqno; + } else { + continue; + } + } + } else if (compaction_style == kCompactionStyleUniversal) { + continue; + } + + // We don't overlap with any keys in this level, but we still need to check + // if our file can fit in it + if (IngestedFileFitInLevel(file_to_ingest, lvl)) { + target_level = lvl; + } + } + // If files overlap, we have to ingest them at level 0 and assign the newest + // sequence number + if (files_overlap_) { + target_level = 0; + *assigned_seqno = last_seqno + 1; + } + + if (ingestion_options_.fail_if_not_bottommost_level && + target_level < cfd_->NumberLevels() - 1) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of Lmax "
+ "does not overlap with files to ingest.");
+ return status;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ &overlap_with_db);
+ file_to_ingest->picked_level = target_level;
+ if (overlap_with_db && *assigned_seqno == 0) {
+ *assigned_seqno = last_seqno + 1;
+ }
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+ IngestedFileInfo* file_to_ingest) {
+ auto* vstorage = cfd_->current()->storage_info();
+ // First, check if the new file fits in the bottommost level
+ int bottom_lvl = cfd_->NumberLevels() - 1;
+ if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as it doesn't fit "
+ "at the bottommost level!");
+ }
+
+ // Second, check whether, despite allow_ingest_behind=true, we still have
+ // 0 seqnums at some upper level
+ for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+ for (auto file : vstorage->LevelFiles(lvl)) {
+ if (file->fd.smallest_seqno == 0) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as despite allow_ingest_behind=true "
+ "there are files with 0 seqno in database at upper levels!");
+ }
+ }
+ }
+
+ file_to_ingest->picked_level = bottom_lvl;
+ return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+ IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+ if (file_to_ingest->original_seqno == seqno) {
+ // This file already has the correct global seqno
+ return Status::OK();
+ } else if (!ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument("Global seqno is required, but disabled");
+ } else if (file_to_ingest->global_seqno_offset == 0) {
+ return Status::InvalidArgument(
+ "Trying to set global seqno for a file that doesn't have a global "
+ "seqno field");
+ }
+
+ if (ingestion_options_.write_global_seqno) {
+ // Determine if we can write global_seqno to a given offset of the file.
+ // If the file system does not support random write, then we should not.
+ // Otherwise we should.
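+ // Three outcomes are possible below: a successful random write stamps the
+ // seqno into the file in place; NotSupported from NewRandomRWFile is
+ // tolerated (the seqno is then tracked only outside the file); any other
+ // error fails the ingestion.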
+ std::unique_ptr rwfile; + Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, + env_options_, &rwfile, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", + &status); + if (status.ok()) { + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); + std::string seqno_val; + PutFixed64(&seqno_val, seqno); + status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, + IOOptions(), nullptr); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(fsptr.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } + if (!status.ok()) { + return status; + } + } else if (!status.IsNotSupported()) { + return status; + } + } + + file_to_ingest->assigned_seqno = seqno; + return Status::OK(); +} + +IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( + IngestedFileInfo* file_to_ingest) { + if (db_options_.file_checksum_gen_factory == nullptr || + need_generate_file_checksum_ == false || + ingestion_options_.write_global_seqno == false) { + // If file_checksum_gen_factory is not set, we are not able to generate + // the checksum. if write_global_seqno is false, it means we will use + // file checksum generated during Prepare(). This step will be skipped. + return IOStatus::OK(); + } + std::string file_checksum; + std::string file_checksum_func_name; + std::string requested_checksum_func_name; + // TODO: rate limit file reads for checksum calculation during file ingestion. 
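+ // The checksum computed below reflects the file as it now exists on disk,
+ // i.e. after the global seqno was written, which is why Prepare() skipped
+ // recording a checksum when write_global_seqno was set.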
+ IOStatus io_s = GenerateOneFileChecksum(
+ fs_.get(), file_to_ingest->internal_file_path,
+ db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
+ &file_checksum, &file_checksum_func_name,
+ ingestion_options_.verify_checksums_readahead_size,
+ db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ file_to_ingest->file_checksum = file_checksum;
+ file_to_ingest->file_checksum_func_name = file_checksum_func_name;
+ return IOStatus::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+ const IngestedFileInfo* file_to_ingest, int level) {
+ if (level == 0) {
+ // Files can always fit in L0
+ return true;
+ }
+
+ auto* vstorage = cfd_->current()->storage_info();
+ Slice file_smallest_user_key(
+ file_to_ingest->smallest_internal_key.user_key());
+ Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+ if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+ &file_largest_user_key)) {
+ // File overlaps with other files in this level, we cannot
+ // add it to this level
+ return false;
+ }
+ if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+ file_largest_user_key, level)) {
+ // File overlaps with a running compaction output that will be stored
+ // in this level, we cannot add this file to this level
+ return false;
+ }
+
+ // File does not overlap with level files or any pending compaction output
+ return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+ assert(file != nullptr);
+ if (db_options_.use_fsync) {
+ return file->Fsync(IOOptions(), nullptr);
+ } else {
+ return file->Sync(IOOptions(), nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..ce50ae86d
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "env/file_system_tracer.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+class SystemClock;
+
+struct IngestedFileInfo {
+ // External file path
+ std::string external_file_path;
+ // Smallest internal key in external file
+ InternalKey smallest_internal_key;
+ // Largest internal key in external file
+ InternalKey largest_internal_key;
+ // Sequence number for keys in external file
+ SequenceNumber original_seqno;
+ // Offset of the global sequence number field in the file, will
+ // be zero if version is 1 (global seqno is not supported)
+ size_t global_seqno_offset;
+ // External file size
+ uint64_t file_size;
+ // total number of keys in external file
+ uint64_t num_entries;
+ // total number of range deletions in external file
+ uint64_t num_range_deletions;
+ // Id of the column family this file should be ingested into
+ uint32_t cf_id;
+ // TableProperties read from external file
+ TableProperties table_properties;
+ // Version of external file
+ int version;
+
+ // FileDescriptor for the file inside the DB
+ FileDescriptor fd;
+ // file path that we picked for the file inside the DB
+ std::string internal_file_path;
+ // Global sequence number that we picked for the file inside the DB
+ SequenceNumber assigned_seqno = 0;
+ // Level inside the DB we picked for the external file.
+ int picked_level = 0;
+ // Whether to copy or link the external sst file. copy_file will be set to
+ // false if ingestion_options.move_files is true and the underlying FS
+ // supports the link operation. Need to provide a default value to make the
+ // undefined-behavior sanity check of llvm happy. Since
+ // ingestion_options.move_files is false by default, copy_file is true
+ // by default.
+ bool copy_file = true;
+ // The checksum of the ingested file
+ std::string file_checksum;
+ // The name of the checksum function that generated the checksum
+ std::string file_checksum_func_name;
+ // The temperature of the file to be ingested
+ Temperature file_temperature = Temperature::kUnknown;
+ // Unique id of the file to be ingested
+ UniqueId64x2 unique_id{};
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+ ExternalSstFileIngestionJob(
+ VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+ SnapshotList* db_snapshots,
+ const IngestExternalFileOptions& ingestion_options,
+ Directories* directories, EventLogger* event_logger,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : clock_(db_options.clock),
+ fs_(db_options.fs, io_tracer),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ env_options_(env_options),
+ db_snapshots_(db_snapshots),
+ ingestion_options_(ingestion_options),
+ directories_(directories),
+ event_logger_(event_logger),
+ job_start_time_(clock_->NowMicros()),
+ consumed_seqno_count_(0),
+ io_tracer_(io_tracer) {
+ assert(directories != nullptr);
+ }
+
+ // Prepare the job by copying external files into the DB.
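+ // Expected call order for this job: Prepare() stages the files, an
+ // optional NeedsFlush() check follows, Run() builds edit() under the DB
+ // mutex, then UpdateStats() on success and Cleanup() with the final status
+ // in either case.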
+ Status Prepare(const std::vector& external_files_paths, + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv); + + // Check if we need to flush the memtable before running the ingestion job + // This will be true if the files we are ingesting are overlapping with any + // key range in the memtable. + // + // @param super_version A referenced SuperVersion that will be held for the + // duration of this function. + // + // Thread-safe + Status NeedsFlush(bool* flush_needed, SuperVersion* super_version); + + // Will execute the ingestion job and prepare edit() to be applied. + // REQUIRES: Mutex held + Status Run(); + + // Update column family stats. + // REQUIRES: Mutex held + void UpdateStats(); + + // Cleanup after successful/failed job + void Cleanup(const Status& status); + + VersionEdit* edit() { return &edit_; } + + const autovector& files_to_ingest() const { + return files_to_ingest_; + } + + // How many sequence numbers did we consume as part of the ingest job? + int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; } + + private: + // Open the external file and populate `file_to_ingest` with all the + // external information we need to ingest this file. + Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, + SuperVersion* sv); + + // Assign `file_to_ingest` the appropriate sequence number and the lowest + // possible level that it can be ingested to according to compaction_style. + // REQUIRES: Mutex held + Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, + bool force_global_seqno, + CompactionStyle compaction_style, + SequenceNumber last_seqno, + IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno); + + // File that we want to ingest behind always goes to the lowest level; + // we just check that it fits in the level, that DB allows ingest_behind, + // and that we don't have 0 seqnums at the upper levels. + // REQUIRES: Mutex held + Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest); + + // Set the file global sequence number to `seqno` + Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, + SequenceNumber seqno); + // Generate the file checksum and store in the IngestedFileInfo + IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest); + + // Check if `file_to_ingest` can fit in level `level` + // REQUIRES: Mutex held + bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, + int level); + + // Helper method to sync given file. + template + Status SyncIngestedFile(TWritableFile* file); + + SystemClock* clock_; + FileSystemPtr fs_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const EnvOptions& env_options_; + SnapshotList* db_snapshots_; + autovector files_to_ingest_; + const IngestExternalFileOptions& ingestion_options_; + Directories* directories_; + EventLogger* event_logger_; + VersionEdit edit_; + uint64_t job_start_time_; + int consumed_seqno_count_; + // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are + // ingested in L0 + bool files_overlap_{false}; + // Set in ExternalSstFileIngestionJob::Prepare(), if true and DB + // file_checksum_gen_factory is set, DB will generate checksum each file. 
+ bool need_generate_file_checksum_{true}; + std::shared_ptr io_tracer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc new file mode 100644 index 000000000..d16f6a58c --- /dev/null +++ b/src/rocksdb/db/external_sst_file_test.cc @@ -0,0 +1,2967 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include + +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "file/filename.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_reader.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "util/thread_guard.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +// A test environment that can be configured to fail the Link operation. +class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + static const char* kClassName() { return "ExternalSSTTestEnv"; } + const char* Name() const override { return kClassName(); } + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternalSSTFileTestBase : public DBTestBase { + public: + ExternalSSTFileTestBase() + : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); + } + + ~ExternalSSTFileTestBase() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } + + protected: + std::string sst_files_dir_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public ExternalSSTFileTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : test_env_(new ExternalSSTTestEnv(env_, true)) { + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + Options options_; + ExternalSSTTestEnv* test_env_; +}; + +class ExternalSSTFileTest + : public ExternalSSTFileTestBase, + public ::testing::WithParamInterface> { + public: + ExternalSSTFileTest() {} + + Status GenerateOneExternalFile( + const Options& options, ColumnFamilyHandle* cfh, + std::vector>& data, int file_id, + bool sort_data, std::string* external_file_path, + std::map* true_data) { + // Generate a file id if not provided + if (-1 == file_id) { + file_id = (++last_file_id_); + } + // Sort data if asked to do so + if (sort_data) { + std::sort(data.begin(), data.end(), + [&](const std::pair& e1, + const std::pair& e2) { + return options.comparator->Compare(e1.first, e2.first) < 0; + }); + auto uniq_iter = std::unique( + data.begin(), data.end(), + [&](const std::pair& e1, + const std::pair& 
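
The class above splits ingestion into phases whose locking requirements are spelled out in its comments: Prepare() copies or links files outside the DB mutex, NeedsFlush() decides whether the memtable must be flushed first, Run() and UpdateStats() execute with the mutex held, and Cleanup() finalizes or rolls back. A compressed sketch of that call order; `StubJob` and the mutex here are illustrative stand-ins, not the RocksDB types:

    #include <iostream>
    #include <mutex>

    struct StubJob {
      bool Prepare() { std::cout << "copy/link external files into the DB\n"; return true; }
      bool NeedsFlush() { return true; }  // ingested keys overlap the memtable
      bool Run() { std::cout << "pick level, assign global seqno, fill edit\n"; return true; }
      void UpdateStats() { std::cout << "update column family stats\n"; }
      void Cleanup(bool ok) { std::cout << (ok ? "keep" : "delete") << " staged files\n"; }
    };

    int main() {
      std::mutex db_mutex;  // stands in for the DB mutex the comments require
      StubJob job;
      bool ok = job.Prepare();
      if (ok && job.NeedsFlush()) {
        std::cout << "flush the memtable before ingesting\n";
      }
      if (ok) {
        std::lock_guard<std::mutex> lock(db_mutex);  // Run() requires mutex held
        ok = job.Run();
        if (ok) {
          job.UpdateStats();
        }
      }
      job.Cleanup(ok);
    }
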
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..d16f6a58c
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2967 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/thread_guard.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+  ExternalSSTTestEnv(Env* t, bool fail_link)
+      : EnvWrapper(t), fail_link_(fail_link) {}
+  static const char* kClassName() { return "ExternalSSTTestEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    if (fail_link_) {
+      return Status::NotSupported("Link failed");
+    }
+    return target()->LinkFile(s, t);
+  }
+
+  void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+  bool fail_link_;
+};
+
+class ExternalSSTFileTestBase : public DBTestBase {
+ public:
+  ExternalSSTFileTestBase()
+      : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+    ASSERT_OK(env_->CreateDir(sst_files_dir_));
+  }
+
+  ~ExternalSSTFileTestBase() override {
+    DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+  }
+
+ protected:
+  std::string sst_files_dir_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+    : public ExternalSSTFileTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternSSTFileLinkFailFallbackTest()
+      : test_env_(new ExternalSSTTestEnv(env_, true)) {
+    options_ = CurrentOptions();
+    options_.disable_auto_compactions = true;
+    options_.env = test_env_;
+  }
+
+  void TearDown() override {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options_));
+    delete test_env_;
+    test_env_ = nullptr;
+  }
+
+ protected:
+  Options options_;
+  ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+    : public ExternalSSTFileTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileTest() {}
+
+  Status GenerateOneExternalFile(
+      const Options& options, ColumnFamilyHandle* cfh,
+      std::vector<std::pair<std::string, std::string>>& data, int file_id,
+      bool sort_data, std::string* external_file_path,
+      std::map<std::string, std::string>* true_data) {
+    // Generate a file id if not provided
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (const auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish().PermitUncheckedError();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+    if (s.ok() && external_file_path != nullptr) {
+      *external_file_path = file_path;
+    }
+    if (s.ok() && nullptr != true_data) {
+      for (const auto& entry : data) {
+        true_data->insert({entry.first, entry.second});
+      }
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options,
+      std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    // Generate a file id if not provided
+    if (file_id == -1) {
+      file_id = last_file_id_ + 1;
+      last_file_id_++;
+    }
+
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish().PermitUncheckedError();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = allow_global_seqno;
+      ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      ifo.ingest_behind = ingest_behind;
+      if (cfh) {
+        s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+      } else {
+        s = db_->IngestExternalFile({file_path}, ifo);
+      }
+    }
+
+    if (s.ok() && true_data) {
+      for (auto& entry : data) {
+        (*true_data)[entry.first] = entry.second;
+      }
+    }
+
+    return s;
+  }
+
+  Status GenerateAndAddExternalFiles(
+      const Options& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::vector<IngestExternalFileOptions>& ifos,
+      std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+      int file_id, bool sort_data,
+      std::vector<std::map<std::string, std::string>>& true_data) {
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Generate external SST files, one for each column family
+    size_t num_cfs = column_families.size();
+    assert(ifos.size() == num_cfs);
+    assert(data.size() == num_cfs);
+    std::vector<IngestExternalFileArg> args(num_cfs);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      std::string external_file_path;
+      Status s = GenerateOneExternalFile(
+          options, column_families[i], data[i], file_id, sort_data,
+          &external_file_path,
+          true_data.size() == num_cfs ? &true_data[i] : nullptr);
+      if (!s.ok()) {
+        return s;
+      }
+      ++file_id;
+
+      args[i].column_family = column_families[i];
+      args[i].external_files.push_back(external_file_path);
+      args[i].options = ifos[i];
+    }
+    return db_->IngestExternalFiles(args);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<std::pair<int, std::string>> data,
+      int file_id = -1, bool allow_global_seqno = false,
+      bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& entry : data) {
+      file_data.emplace_back(Key(entry.first), entry.second);
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data, cfh);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& k : keys) {
+      file_data.emplace_back(Key(k), Key(k) + std::to_string(file_id));
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data, cfh);
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false,
+                           bool skip_write_global_seqno = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    opts.write_global_seqno = !skip_write_global_seqno;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+ protected:
+  int last_file_id_ = 0;
+};
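
The helpers above wrap a two-step flow: build an external file with SstFileWriter, then hand it to the DB for ingestion. A condensed sketch of that flow using the public API directly (the paths and keys are illustrative):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/sst_file_writer.h"

    int main() {
      using namespace ROCKSDB_NAMESPACE;
      Options options;
      options.create_if_missing = true;

      // 1) Write an external SST file; keys must be added in comparator order.
      SstFileWriter writer(EnvOptions(), options);
      assert(writer.Open("/tmp/example.sst").ok());
      assert(writer.Put("k1", "v1").ok());
      assert(writer.Put("k2", "v2").ok());
      assert(writer.Finish().ok());

      // 2) Ingest it; the DB copies or links the file into the LSM tree.
      DB* db = nullptr;
      assert(DB::Open(options, "/tmp/example_db", &db).ok());
      IngestExternalFileOptions ifo;  // defaults: copy the file, keep snapshot consistency
      assert(db->IngestExternalFile({"/tmp/example.sst"}, ifo).ok());
      delete db;
    }
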
+
+TEST_F(ExternalSSTFileTest, Basic) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // Current file size should be 0 after sst_file_writer init and before
+    // opening a file.
+    ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+
+    // Current file size should be non-zero after a successful write.
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    ASSERT_EQ(file1_info.num_range_del_entries, 0);
+    ASSERT_EQ(file1_info.smallest_range_del_key, "");
+    ASSERT_EQ(file1_info.largest_range_del_key, "");
+    // sst_file_writer already finished; cannot add this value
+    ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after the last added key
+    ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 299)
+    // This file's values overlap with file2's values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    ASSERT_OK(sst_file_writer.Finish(&file3_info));
+
+    // Current file size should be non-zero after a successful finish.
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 105);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(299));
+
+    // file4.sst (30 => 39)
+    // This file's values overlap with file1's values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (400 => 499)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(400));
+    ASSERT_EQ(file5_info.largest_key, Key(499));
+
+    // file6.sst (delete 400 => 500)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+    ExternalSstFileInfo file6_info;
+    ASSERT_OK(sst_file_writer.Finish(&file6_info));
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 1);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+    // file7.sst (delete 500 => 575, put keys divisible by 2 in 520 => 598)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550)));
+    for (int k = 520; k < 560; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575)));
+    for (int k = 560; k < 600; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file7_info;
+    ASSERT_OK(sst_file_writer.Finish(&file7_info));
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 40);
+    ASSERT_EQ(file7_info.smallest_key, Key(520));
+    ASSERT_EQ(file7_info.largest_key, Key(598));
+    ASSERT_EQ(file7_info.num_range_del_entries, 2);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+    // file8.sst (delete 600 => 700)
+    std::string file8 = sst_files_dir_ + "file8.sst";
+    ASSERT_OK(sst_file_writer.Open(file8));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700)));
+    ExternalSstFileInfo file8_info;
+    ASSERT_OK(sst_file_writer.Finish(&file8_info));
+    ASSERT_EQ(file8_info.file_path, file8);
+    ASSERT_EQ(file8_info.num_entries, 0);
+    ASSERT_EQ(file8_info.smallest_key, "");
+    ASSERT_EQ(file8_info.largest_key, "");
+    ASSERT_EQ(file8_info.num_range_del_entries, 1);
+    ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+    ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+    // Cannot create an empty sst file
+    std::string file_empty = sst_files_dir_ + "file_empty.sst";
+    ExternalSstFileInfo file_empty_info;
+    ASSERT_NOK(sst_file_writer.Finish(&file_empty_info));
+
+    DestroyAndReopen(options);
+    // Add file using file path
+    ASSERT_OK(DeprecatedAddFile({file1}));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 100; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // Adding a file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile({file2}));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releasing the snapshot
+    ASSERT_OK(DeprecatedAddFile({file2}));
+
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file3}));
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file4}));
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file6}));
+
+    // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file7}));
+
+    // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file8}));
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 400; k < 500; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 500; k < 600; k++) {
+        std::string value = Key(k) + "_val";
+        if (k < 520 || k % 2 == 1) {
+          value = "NOT_FOUND";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    Close();
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    // Delete keys in range (400 => 499)
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (400 => 499), but we still cannot add file5 because
+    // of the delete tombstones
+    ASSERT_NOK(DeprecatedAddFile({file5}));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // Verify values of file5 in DB
+    for (int k = 400; k < 500; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
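
file6 through file8 in the test above exercise range deletions written into external files. A minimal sketch of that capability on its own (the path is illustrative):

    #include <cassert>
    #include "rocksdb/options.h"
    #include "rocksdb/sst_file_writer.h"

    int main() {
      using namespace ROCKSDB_NAMESPACE;
      SstFileWriter writer(EnvOptions(), Options());
      assert(writer.Open("/tmp/range_del.sst").ok());
      // One tombstone covering ["a", "m"); the file needs no point entries.
      assert(writer.DeleteRange("a", "m").ok());
      ExternalSstFileInfo info;
      assert(writer.Finish(&info).ok());
      assert(info.num_range_del_entries == 1);
      assert(info.num_entries == 0);
    }
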
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+  explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+    name_ = prefix_ + "_SstFileWriterCollector";
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string count = std::to_string(count_);
+    *properties = UserCollectedProperties{
+        {prefix_ + "_SstFileWriterCollector", "YES"},
+        {prefix_ + "_Count", count},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    ++count_;
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+  std::string prefix_;
+  std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  explicit SstFileWriterCollectorFactory(std::string prefix)
+      : prefix_(prefix), num_created_(0) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    num_created_++;
+    return new SstFileWriterCollector(prefix_);
+  }
+  const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+  std::string prefix_;
+  uint32_t num_created_;
+};
+
+TEST_F(ExternalSSTFileTest, AddList) {
+  do {
+    Options options = CurrentOptions();
+
+    auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+    auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+    options.table_properties_collector_factories.emplace_back(abc_collector);
+    options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    // sst_file_writer already finished; cannot add this value
+    ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after the last added key
+    ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 199)
+    // This file's values overlap with file2's values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    ASSERT_OK(sst_file_writer.Finish(&file3_info));
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 5);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(199));
+
+    // file4.sst (30 => 39)
+    // This file's values overlap with file1's values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (200 => 299)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(200));
+    ASSERT_EQ(file5_info.largest_key, Key(299));
+
+    // file6.sst (delete 0 => 100)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+    ExternalSstFileInfo file6_info;
+    ASSERT_OK(sst_file_writer.Finish(&file6_info));
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 2);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+    // file7.sst (delete 99 => 201)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+    ExternalSstFileInfo file7_info;
+    ASSERT_OK(sst_file_writer.Finish(&file7_info));
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 0);
+    ASSERT_EQ(file7_info.smallest_key, "");
+    ASSERT_EQ(file7_info.largest_key, "");
+    ASSERT_EQ(file7_info.num_range_del_entries, 1);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+    // file_list1 has an internal key-range conflict
+    std::vector<std::string> file_list0({file1, file2});
+    std::vector<std::string> file_list1({file3, file2, file1});
+    std::vector<std::string> file_list2({file5});
+    std::vector<std::string> file_list3({file3, file4});
+    std::vector<std::string> file_list4({file5, file7});
+    std::vector<std::string> file_list5({file6, file7});
+
+    DestroyAndReopen(options);
+
+    // These lists of files have key ranges that overlap with each other
+    ASSERT_NOK(DeprecatedAddFile(file_list1));
+    // Both of the following overlap on the range deletion tombstone.
+    ASSERT_NOK(DeprecatedAddFile(file_list4));
+    ASSERT_NOK(DeprecatedAddFile(file_list5));
+
+    // Add files using file path list
+    ASSERT_OK(DeprecatedAddFile(file_list0));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 2);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // Adding a file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile(file_list2));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releasing the snapshot
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 300; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 3);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // This file list has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile(file_list3));
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 200; k < 300; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    // Delete keys in range (200 => 299)
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (200 => 299), but we still cannot add file5 because
+    // of the delete tombstones
+    ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+
+    // Verify values of file5 in DB
+    for (int k = 200; k < 300; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
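
The next test checks that a single ingestion call with several files is all-or-nothing: if any file in the list cannot be ingested, none of them are. A sketch of the shape of such a call (paths illustrative):

    #include "rocksdb/db.h"

    // If "b.sst" is missing or conflicts, "a.sst" is not ingested either.
    ROCKSDB_NAMESPACE::Status IngestBatch(ROCKSDB_NAMESPACE::DB* db) {
      ROCKSDB_NAMESPACE::IngestExternalFileOptions ifo;
      return db->IngestExternalFile({"/tmp/a.sst", "/tmp/b.sst"}, ifo);
    }
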
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // files[0].sst (0 => 99)
+    // files[1].sst (100 => 199)
+    // ...
+    // files[8].sst (800 => 899)
+    int n = 9;
+    std::vector<std::string> files(n);
+    std::vector<ExternalSstFileInfo> files_info(n);
+    for (int i = 0; i < n; i++) {
+      files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+      ASSERT_OK(sst_file_writer.Open(files[i]));
+      for (int k = i * 100; k < (i + 1) * 100; k++) {
+        ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+      }
+      ASSERT_OK(sst_file_writer.Finish(&files_info[i]));
+      ASSERT_EQ(files_info[i].file_path, files[i]);
+      ASSERT_EQ(files_info[i].num_entries, 100);
+      ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+      ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+    }
+    files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+    ASSERT_NOK(DeprecatedAddFile(files));
+    for (int k = 0; k < n * 100; k++) {
+      ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+    }
+    files.pop_back();
+    ASSERT_OK(DeprecatedAddFile(files));
+    for (int k = 0; k < n * 100; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+// This test reproduces a bug that could happen in some cases if the DB
+// started purging obsolete files while we were adding an external sst file.
+// This situation could result in the file being deleted while it is being
+// added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+  Options options = CurrentOptions();
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 499)
+  std::string sst_file_path = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(sst_file_path));
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    ASSERT_OK(sst_file_writer.Put(k, k + "_val"));
+  }
+
+  ExternalSstFileInfo sst_file_info;
+  ASSERT_OK(sst_file_writer.Finish(&sst_file_info));
+
+  options.delete_obsolete_files_period_micros = 0;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+        ASSERT_OK(Put("aaa", "bbb"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(Put("aaa", "xxx"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(DeprecatedAddFile({sst_file_path}));
+
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    std::string v = k + "_val";
+    ASSERT_EQ(Get(k), v);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
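
The tests here lean heavily on the sync-point facility: LoadDependency() installs happens-before edges between named points, and SetCallBack() runs a hook when a point fires. A sketch of the typical arming sequence; the point names below are illustrative, not real RocksDB markers:

    #include "test_util/sync_point.h"

    void ArmSyncPoints() {
      using ROCKSDB_NAMESPACE::SyncPoint;
      // "Test:Ready" must be reached before "DBImpl:Proceed" may continue.
      SyncPoint::GetInstance()->LoadDependency({{"Test:Ready", "DBImpl:Proceed"}});
      SyncPoint::GetInstance()->SetCallBack(
          "DBImpl:Proceed", [](void* /*arg*/) { /* inspect or mutate state */ });
      SyncPoint::GetInstance()->EnableProcessing();
    }
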
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  ASSERT_OK(sst_file_writer.Finish(&file1_info));
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_dir_ + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  ASSERT_OK(sst_file_writer.Finish(&file2_info));
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  ASSERT_OK(DeprecatedAddFile({file1}));
+
+  // Adding a file will fail when holding a snapshot and using the default
+  // skip_snapshot_check == false
+  const Snapshot* s1 = db_->GetSnapshot();
+  if (s1 != nullptr) {
+    ASSERT_NOK(DeprecatedAddFile({file2}));
+  }
+
+  // Adding a file will succeed when skip_snapshot_check is set to true, even
+  // while the DB is holding a snapshot
+  if (s1 != nullptr) {
+    ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+    db_->ReleaseSnapshot(s1);
+  }
+
+  // file3.sst (300 => 399)
+  std::string file3 = sst_files_dir_ + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 300; k < 400; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file3_info;
+  ASSERT_OK(sst_file_writer.Finish(&file3_info));
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 100);
+  ASSERT_EQ(file3_info.smallest_key, Key(300));
+  ASSERT_EQ(file3_info.largest_key, Key(399));
+
+  // Check that the key is not there yet, then ingest file3 while holding a
+  // snapshot; with skip_snapshot_check the snapshot also sees the new keys
+  ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+  ASSERT_EQ(Get(Key(300)), Key(300) + "_val");
+  ASSERT_EQ(Get(Key(300), s2), Key(300) + "_val");
+
+  db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+  env_->skip_fsync_ = true;
+  // Bulk load 10 files, each containing 1000 keys
+  int num_files = 10;
+  int keys_per_file = 1000;
+
+  // Generate file names
+  std::vector<std::string> file_names;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file_" + std::to_string(i) + ".sst";
+    file_names.push_back(sst_files_dir_ + file_name);
+  }
+
+  do {
+    Options options = CurrentOptions();
+
+    std::atomic<int> thread_num(0);
+    std::function<void()> write_file_func = [&]() {
+      int file_idx = thread_num.fetch_add(1);
+      int range_start = file_idx * keys_per_file;
+      int range_end = range_start + keys_per_file;
+
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+
+      ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+      for (int k = range_start; k < range_end; k++) {
+        ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+      }
+
+      ASSERT_OK(sst_file_writer.Finish());
+    };
+    // Write num_files files in parallel
+    std::vector<port::Thread> sst_writer_threads;
+    for (int i = 0; i < num_files; ++i) {
+      sst_writer_threads.emplace_back(write_file_func);
+    }
+
+    for (auto& t : sst_writer_threads) {
+      t.join();
+    }
+
+    fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    thread_num.store(0);
+    std::atomic<int> files_added(0);
+    // Thread 0 -> Load {f0,f1}
+    // Thread 1 -> Load {f0,f1}
+    // Thread 2 -> Load {f2,f3}
+    // Thread 3 -> Load {f2,f3}
+    // Thread 4 -> Load {f4,f5}
+    // Thread 5 -> Load {f4,f5}
+    // ...
+    std::function<void()> load_file_func = [&]() {
+      // We intentionally add every file twice, and assert that each file was
+      // added only once and that the other add failed
+      int thread_id = thread_num.fetch_add(1);
+      int file_idx = (thread_id / 2) * 2;
+      // Sometimes we use copy, sometimes link; the result should be the same
+      bool move_file = (thread_id % 3 == 0);
+
+      std::vector<std::string> files_to_add;
+
+      files_to_add = {file_names[file_idx]};
+      if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+        files_to_add.push_back(file_names[file_idx + 1]);
+      }
+
+      Status s = DeprecatedAddFile(files_to_add, move_file);
+      if (s.ok()) {
+        files_added += static_cast<int>(files_to_add.size());
+      }
+    };
+
+    // Bulk load num_files files in parallel
+    std::vector<port::Thread> add_file_threads;
+    DestroyAndReopen(options);
+    for (int i = 0; i < num_files; ++i) {
+      add_file_threads.emplace_back(load_file_func);
+    }
+
+    for (auto& t : add_file_threads) {
+      t.join();
+    }
+    ASSERT_EQ(files_added.load(), num_files);
+    fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    // Overwrite values of keys divisible by 100
+    for (int k = 0; k < num_files * keys_per_file; k += 100) {
+      std::string key = Key(k);
+      ASSERT_OK(Put(key, key + "_new"));
+    }
+
+    for (int i = 0; i < 2; i++) {
+      // Make sure the values are correct before and after flush/compaction
+      for (int k = 0; k < num_files * keys_per_file; ++k) {
+        std::string key = Key(k);
+        std::string value = (k % 100 == 0) ? (key + "_new") : key;
+        ASSERT_EQ(Get(key), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
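
MultiThreaded above passes move_file through to IngestExternalFileOptions::move_files: when set, the DB tries to hard-link the external file into place instead of copying it, and (per the copy_file comment in the ingestion job header) falls back to copying when the filesystem cannot link. Sketch (path illustrative):

    #include "rocksdb/db.h"

    ROCKSDB_NAMESPACE::Status IngestByMove(ROCKSDB_NAMESPACE::DB* db) {
      ROCKSDB_NAMESPACE::IngestExternalFileOptions ifo;
      ifo.move_files = true;  // link if possible; the source file is consumed
      return db->IngestExternalFile({"/tmp/bulk.sst"}, ifo);
    }
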
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+  env_->skip_fsync_ = true;
+  Random rnd(301);
+  SequenceNumber assigned_seqno = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+      });
+  bool need_flush = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        need_flush = *(static_cast<bool*>(arg));
+      });
+  bool overlap_with_db = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+      [&overlap_with_db](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        overlap_with_db = *(static_cast<bool*>(arg));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  do {
+    Options options = CurrentOptions();
+    env_->skip_fsync_ = true;
+    DestroyAndReopen(options);
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    printf("Option config = %d\n", option_config_);
+    std::vector<std::pair<int, int>> key_ranges;
+    for (int i = 0; i < 100; i++) {
+      int range_start = rnd.Uniform(20000);
+      int keys_per_range = 10 + rnd.Uniform(41);
+
+      key_ranges.emplace_back(range_start, range_start + keys_per_range);
+    }
+
+    int memtable_add = 0;
+    int success_add_file = 0;
+    int failed_add_file = 0;
+    std::map<std::string, std::string> true_data;
+    for (size_t i = 0; i < key_ranges.size(); i++) {
+      int range_start = key_ranges[i].first;
+      int range_end = key_ranges[i].second;
+
+      Status s;
+      std::string range_val = "range_" + std::to_string(i);
+
+      // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+      if (i && i % 5 == 0) {
+        // Use DB::Put to insert range (insert into memtable)
+        range_val += "_put";
+        for (int k = range_start; k <= range_end; k++) {
+          s = Put(Key(k), range_val);
+          ASSERT_OK(s);
+        }
+        memtable_add++;
+      } else {
+        // Use DB::AddFile to insert range
+        range_val += "_add_file";
+
+        // Generate the file containing the range
+        std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+        s = sst_file_writer.Open(file_name);
+        ASSERT_OK(s);
+        for (int k = range_start; k <= range_end; k++) {
+          s = sst_file_writer.Put(Key(k), range_val);
+          ASSERT_OK(s);
+        }
+        ExternalSstFileInfo file_info;
+        s = sst_file_writer.Finish(&file_info);
+        ASSERT_OK(s);
+
+        // Insert the generated file
+        s = DeprecatedAddFile({file_name});
+        auto it = true_data.lower_bound(Key(range_start));
+        if (option_config_ != kUniversalCompaction &&
+            option_config_ != kUniversalCompactionMultiLevel &&
+            option_config_ != kUniversalSubcompactions) {
+          if (it != true_data.end() && it->first <= Key(range_end)) {
+            // This range overlaps with data that already exists in the DB
+            ASSERT_NOK(s);
+            failed_add_file++;
+          } else {
+            ASSERT_OK(s);
+            success_add_file++;
+          }
+        } else {
+          if ((it != true_data.end() && it->first <= Key(range_end)) ||
+              need_flush || assigned_seqno > 0 || overlap_with_db) {
+            // This range overlaps with data that already exists in the DB
+            ASSERT_NOK(s);
+            failed_add_file++;
+          } else {
+            ASSERT_OK(s);
+            success_add_file++;
+          }
+        }
+      }
+
+      if (s.ok()) {
+        // Update true_data map to include the new inserted data
+        for (int k = range_start; k <= range_end; k++) {
+          true_data[Key(k)] = range_val;
+        }
+      }
+
+      // Flush / Compact the DB
+      if (i && i % 50 == 0) {
+        ASSERT_OK(Flush());
+      }
+      if (i && i % 75 == 0) {
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      }
+    }
+
+    printf("Total: %" ROCKSDB_PRIszt
+           " ranges\n"
+           "AddFile()|Success: %d ranges\n"
+           "AddFile()|RangeConflict: %d ranges\n"
+           "Put(): %d ranges\n",
+           key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+    // Verify the correctness of the data
+    for (const auto& kv : true_data) {
+      ASSERT_EQ(Get(kv.first), kv.second);
+    }
+    printf("keys/values verified\n");
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+
+  std::map<std::string, std::string> true_data;
+
+  // File 0 will go to the last level (L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+  // File 1 will go to level L2 (since it overlaps with file 0 in L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+      {"DBImpl::BackgroundCompaction:Start",
+       "ExternalSSTFileTest::PickedLevel:1"},
+      {"ExternalSSTFileTest::PickedLevel:2",
+       "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Flush 4 files containing the same keys
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put(Key(3), Key(3) + "put"));
+    ASSERT_OK(Put(Key(8), Key(8) + "put"));
+    true_data[Key(3)] = Key(3) + "put";
+    true_data[Key(8)] = Key(8) + "put";
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for BackgroundCompaction() to be called
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+  EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+  // This file overlaps with file 0 (L3), file 1 (L2) and the
+  // output of the compaction going to L1
+  ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+  // This file does not overlap with any file or with the running compaction
+  ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+                                       false, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+  // Hold compaction from finishing
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 3;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  std::vector<int> file_keys;
+
+  // file #1 in L0
+  file_keys = {0, 5, 7};
+  for (int k : file_keys) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+  }
+  ASSERT_OK(Flush());
+
+  // file #2 in L0
+  file_keys = {4, 6, 8, 9};
+  for (int k : file_keys) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+  }
+  ASSERT_OK(Flush());
+
+  // We have 2 overlapping files in L0
+  EXPECT_EQ(FilesPerLevel(), "2");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+        "ExternalSSTFileTest::PickedLevelBug:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+       {"ExternalSSTFileTest::PickedLevelBug:2",
+        "DBImpl::RunManualCompaction:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:3",
+        "DBImpl::RunManualCompaction:1"}});
+
+  std::atomic<bool> bg_compact_started(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status bg_compact_status;
+  Status bg_addfile_status;
+
+  {
+    // While writing the MANIFEST, start a thread that will ask for compaction
+    ThreadGuard bg_compact(port::Thread([&]() {
+      bg_compact_status =
+          db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }));
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+    // Start a thread that will ingest a new file
+    ThreadGuard bg_addfile(port::Thread([&]() {
+      file_keys = {1, 2, 3};
+      bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
+    }));
+
+    // Wait for AddFile to start picking levels and writing the MANIFEST
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+    // We need to verify that no compactions can run while AddFile is
+    // ingesting the files into the levels it finds suitable. So we will
+    // wait for 2 seconds to give a chance for compactions to run during
+    // this period, and then make sure that no compactions were able to run
+    env_->SleepForMicroseconds(1000000 * 2);
+    bool bg_compact_started_tmp = bg_compact_started.load();
+
+    // Hold AddFile from finishing writing the MANIFEST
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+    // Check the status at the end, so even if the ASSERT fails the threads
+    // can still be joined and return.
+    ASSERT_FALSE(bg_compact_started_tmp);
+  }
+
+  ASSERT_OK(bg_addfile_status);
+  ASSERT_OK(bg_compact_status);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  int total_keys = 0;
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    total_keys++;
+  }
+  ASSERT_EQ(total_keys, 10);
+
+  delete iter;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  Status s = db_->IngestExternalFile({"non_existing_file"},
+                                     IngestExternalFileOptions());
+  ASSERT_NOK(s);
+
+  // Verify file deletion is not impacted (verify a bug fix)
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // After full compaction, there should be only 1 file.
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  int num_sst_files = 0;
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kTableFile) {
+      num_sst_files++;
+    }
+  }
+  ASSERT_EQ(1, num_sst_files);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  std::function<void()> bg_compact = [&]() {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  };
+
+  int range_id = 0;
+  std::vector<int> file_keys;
+  std::function<void()> bg_addfile = [&]() {
+    ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+  };
+
+  const int num_of_ranges = 1000;
+  std::vector<port::Thread> threads;
+  while (range_id < num_of_ranges) {
+    int range_start = range_id * 10;
+    int range_end = range_start + 10;
+
+    file_keys.clear();
+    for (int k = range_start + 1; k < range_end; k++) {
+      file_keys.push_back(k);
+    }
+    ASSERT_OK(Put(Key(range_start), Key(range_start)));
+    ASSERT_OK(Put(Key(range_end), Key(range_end)));
+    ASSERT_OK(Flush());
+
+    if (range_id % 10 == 0) {
+      threads.emplace_back(bg_compact);
+    }
+    threads.emplace_back(bg_addfile);
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    threads.clear();
+
+    range_id++;
+  }
+
+  for (int rid = 0; rid < num_of_ranges; rid++) {
+    int range_start = rid * 10;
+    int range_end = range_start + 10;
+
+    ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+    ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+    for (int k = range_start + 1; k < range_end; k++) {
+      std::string v = Key(k) + std::to_string(rid);
+      ASSERT_EQ(Get(Key(k)), v) << rid;
+    }
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+  std::map<std::string, std::string> true_data;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ExternalSSTFileTest::PickedLevelDynamic:0",
+       "BackgroundCallCompaction:0"},
+      {"DBImpl::BackgroundCompaction:Start",
+       "ExternalSSTFileTest::PickedLevelDynamic:1"},
+      {"ExternalSSTFileTest::PickedLevelDynamic:2",
+       "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Flush 4 files containing the same keys
+  for (int i = 0; i < 4; i++) {
+    for (int k = 20; k <= 30; k++) {
+      ASSERT_OK(Put(Key(k), Key(k) + "put"));
+      true_data[Key(k)] = Key(k) + "put";
+    }
+    for (int k = 50; k <= 60; k++) {
+      ASSERT_OK(Put(Key(k), Key(k) + "put"));
+      true_data[Key(k)] = Key(k) + "put";
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for BackgroundCompaction() to be called
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+  // This file overlaps with the output of the compaction (going to L3),
+  // so the file will be added to L0 since L3 is the base level
+  ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+                                       false, true, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5");
+
+  // This file does not overlap with the currently running compaction
+  ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+                                       true, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+  // Hold compaction from finishing
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+  // Output of the compaction will go to L3
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+  Close();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+                                       true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+  // File 5 overlaps with file 2 (L3 / base level)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+                                       false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+  // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+                                       false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+  // Verify data in files
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  // Write range [5 => 10] to L0
+  for (int i = 5; i <= 10; i++) {
+    std::string k = Key(i);
+    std::string v = k + "put";
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+  // File 7 overlaps with file 4 (L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+  Options options = CurrentOptions();
+  options.comparator = ReverseBytewiseComparator();
+  DestroyAndReopen(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Generate files with these key ranges
+  // {14 -> 0}
+  // {24 -> 10}
+  // {34 -> 20}
+  // {44 -> 30}
+  // ..
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+    ASSERT_OK(sst_file_writer.Open(file_name));
+
+    int range_end = i * 10;
+    int range_start = range_end + 15;
+    for (int k = (range_start - 1); k >= range_end; k--) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+    }
+    ExternalSstFileInfo file_info;
+    ASSERT_OK(sst_file_writer.Finish(&file_info));
+    generated_files.push_back(file_name);
+  }
+
+  std::vector<std::string> in_files;
+
+  // The 2nd and 3rd files in this list overlap with each other
+  in_files = {generated_files[0], generated_files[4], generated_files[5],
+              generated_files[7]};
+  ASSERT_NOK(DeprecatedAddFile(in_files));
+
+  // These 2 files don't overlap with each other
+  in_files = {generated_files[0], generated_files[2]};
+  ASSERT_OK(DeprecatedAddFile(in_files));
+
+  // These 2 files don't overlap with each other but overlap with keys in DB
+  in_files = {generated_files[3], generated_files[7]};
+  ASSERT_NOK(DeprecatedAddFile(in_files));
+
+  // Files don't overlap and don't overlap with DB key range
+  in_files = {generated_files[4], generated_files[6], generated_files[8]};
+  ASSERT_OK(DeprecatedAddFile(in_files));
+
+  for (int i = 0; i < 100; i++) {
+    if (i % 20 <= 14) {
+      ASSERT_EQ(Get(Key(i)), Key(i));
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+}
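
As the test above shows, ingestion honors the column family's comparator: the SstFileWriter must be constructed with the same comparator, and keys must be appended in that order. A sketch with the built-in reverse comparator (path illustrative):

    #include <cassert>
    #include "rocksdb/comparator.h"
    #include "rocksdb/options.h"
    #include "rocksdb/sst_file_writer.h"

    int main() {
      using namespace ROCKSDB_NAMESPACE;
      Options options;
      options.comparator = ReverseBytewiseComparator();
      SstFileWriter writer(EnvOptions(), options);
      assert(writer.Open("/tmp/reverse.sst").ok());
      // Under the reverse comparator, "b" sorts before "a".
      assert(writer.Put("b", "v").ok());
      assert(writer.Put("a", "v").ok());
      assert(writer.Finish().ok());
    }
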
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+  ASSERT_OK(sst_file_writer.Finish());
+  ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+        "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+       {"DBImpl::WaitForPendingWrites:BeforeBlock",
+        "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+        ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+      });
+
+  Options options = CurrentOptions();
+  options.unordered_write = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); });
+
+  TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+  ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+                                       true /* allow_global_seqno */));
+  ASSERT_EQ(Get("bar"), "v3");
+
+  writer.join();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.IncreaseParallelism(20);
+  options.level0_slowdown_writes_trigger = 256;
+  options.level0_stop_writes_trigger = 256;
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  for (int iter = 0; iter < 2; iter++) {
+    bool write_to_memtable = (iter == 0);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    std::map<std::string, std::string> true_data;
+    for (int i = 0; i < 500; i++) {
+      std::vector<std::pair<std::string, std::string>> random_data;
+      for (int j = 0; j < 100; j++) {
+        std::string k = rnd.RandomString(rnd.Next() % 20);
+        std::string v = rnd.RandomString(rnd.Next() % 50);
+        random_data.emplace_back(k, v);
+      }
+
+      if (write_to_memtable && rnd.OneIn(4)) {
+        // 25% of writes go through memtable
+        for (auto& entry : random_data) {
+          ASSERT_OK(Put(entry.first, entry.second));
+          true_data[entry.first] = entry.second;
+        }
+      } else {
+        ASSERT_OK(GenerateAndAddExternalFile(
+            options, random_data, -1, true, write_global_seqno,
+            verify_checksums_before_ingest, false, true, &true_data));
+      }
+    }
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    VerifyDBFromMap(true_data, &kcnt, false);
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+  Options options = CurrentOptions();
+  options.num_levels = 5;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  std::vector<std::pair<std::string, std::string>> file_data;
+  std::map<std::string, std::string> true_data;
+
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+
+  // Insert 0 -> 20 using AddFile
+  file_data.clear();
+  for (int i = 0; i <= 20; i++) {
+    file_data.emplace_back(Key(i), "L4");
+  }
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB, so it will go to L4
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+  // Insert 80 -> 130 using AddFile
+  file_data.clear();
+  for (int i = 80; i <= 130; i++) {
+    file_data.emplace_back(Key(i), "L0");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with the memtable, so it will flush it and add
+  // itself to L0
+  ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+  // Insert 30 -> 50 using AddFile
+  file_data.clear();
+  for (int i = 30; i <= 50; i++) {
+    file_data.emplace_back(Key(i), "L4");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB and fits in L4 as well
+  ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+  // Insert 10 -> 40 using AddFile
+  file_data.clear();
+  for (int i = 10; i <= 40; i++) {
+    file_data.emplace_back(Key(i), "L3");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with files in L4, so we will ingest it in L3
+  ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  uint64_t entries_in_memtable;
+  std::map<std::string, std::string> true_data;
+
+  for (int k : {10, 20, 40, 80}) {
+    ASSERT_OK(Put(Key(k), "memtable"));
+    true_data[Key(k)] = "memtable";
+  }
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No need for flush
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {90, 100, 110}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // This file will flush the memtable
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {19, 20, 21}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_EQ(entries_in_memtable, 0);
+
+  for (int k : {200, 201, 205, 206}) {
+    ASSERT_OK(Put(Key(k), "memtable"));
+    true_data[Key(k)] = "memtable";
+  }
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // No need for flush, this file's keys fit between the memtable keys
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {202, 203, 204}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // This file will flush the memtable
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {206, 207}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_EQ(entries_in_memtable, 0);
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+  std::map<std::string, std::string> true_data;
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Put(Key(10), "memtable"));
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No flush needed, no global seqno needed, ingest into L1
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  // No flush needed, but a global seqno is needed, ingest into L0
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  printf("%s\n", FilesPerLevel().c_str());
+
+  // Overwrite what we added using external files
+  ASSERT_OK(Put(Key(7), "memtable"));
+  ASSERT_OK(Put(Key(8), "memtable"));
+
+  // Read values from memtable
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+
+  // Flush and read from L0
+  ASSERT_OK(Flush());
+  printf("%s\n", FilesPerLevel().c_str());
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 4;
+  DestroyAndReopen(options);
+
+  // Atomic counter of currently running bg threads
+  std::atomic<int> running_threads(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+      {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+      {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+      {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start ingesting an external file in the background
+  ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+    running_threads += 1;
+    ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+    running_threads -= 1;
+  });
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(2), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(3), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(4), "memtable"));
+  ASSERT_OK(Flush());
+
+  // This thread will try to insert into the memtable but since we have 4 L0
+  // files this thread will be blocked and hold the writer thread
+  ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+    running_threads += 1;
+    ASSERT_OK(Put(Key(10), "memtable"));
+    running_threads -= 1;
+  });
+
+  // Make sure DelayWrite is called first
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+  // `DBImpl::AddFile:Start` will wait until we are here
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+  // Wait for IngestExternalFile() to start and acquire the mutex
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
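+  // At this point the ingest thread holds the DB mutex while the blocked
+  // writer stalls on the L0 file count; the bug being checked is that
+  // letting the compaction run now must not leave these three parties
+  // waiting on one another.
+  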
// Now let compaction start + TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3"); + + // Wait for max 5 seconds, if we did not finish all bg threads + // then we hit the deadlock bug + for (int i = 0; i < 10; i++) { + if (running_threads.load() == 0) { + break; + } + // Make sure we do a "real sleep", not a mock one. + SystemClock::Default()->SleepForMicroseconds(500000); + } + + ASSERT_EQ(running_threads.load(), 0); + + bg_ingest_file.join(); + bg_block_put.join(); +} + +TEST_F(ExternalSSTFileTest, DirtyExit) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::string file_path = sst_files_dir_ + "/dirty_exit"; + std::unique_ptr sst_file_writer; + + // Destruct SstFileWriter without calling Finish() + sst_file_writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(sst_file_writer->Open(file_path)); + sst_file_writer.reset(); + + // Destruct SstFileWriter with a failing Finish + sst_file_writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(sst_file_writer->Open(file_path)); + ASSERT_NOK(sst_file_writer->Finish()); +} + +TEST_F(ExternalSSTFileTest, FileWithCFInfo) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko", "toto"}, options); + + SstFileWriter sfw_default(EnvOptions(), options, handles_[0]); + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // default_cf.sst + const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst"; + ASSERT_OK(sfw_default.Open(cf_default_sst)); + ASSERT_OK(sfw_default.Put("K1", "V1")); + ASSERT_OK(sfw_default.Put("K2", "V2")); + ASSERT_OK(sfw_default.Finish()); + + // cf1.sst + const std::string cf1_sst = sst_files_dir_ + "/cf1.sst"; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K3", "V1")); + ASSERT_OK(sfw_cf1.Put("K4", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst"; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K5", "V1")); + ASSERT_OK(sfw_unknown.Put("K6", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + IngestExternalFileOptions ifo; + + // SST CF don't match + ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo)); + // SST CF don't match + ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo)); + // SST CF match + ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo)); + + // SST CF don't match + ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo)); + // SST CF don't match + ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo)); + // SST CF match + ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo)); + + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo)); + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo)); + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo)); + + // Cannot ingest a file into a dropped CF + ASSERT_OK(db_->DropColumnFamily(handles_[1])); + ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo)); + + // CF was not dropped, ok to Ingest + ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo)); +} + +/* + * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy + */ +TEST_P(ExternSSTFileLinkFailFallbackTest, 
LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); + const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; + + std::string file_path = sst_files_dir_ + "file1.sst"; + // Create SstFileWriter for default column family + SstFileWriter sst_file_writer(env_options, options_); + ASSERT_OK(sst_file_writer.Open(file_path)); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); + } + ASSERT_OK(sst_file_writer.Finish()); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_path, &file_size)); + + bool copyfile = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); + + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + const InternalStats* internal_stats_ptr = cfd->internal_stats(); + const std::vector& comp_stats = + internal_stats_ptr->TEST_GetCompactionStats(); + uint64_t bytes_copied = 0; + uint64_t bytes_moved = 0; + for (const auto& stats : comp_stats) { + bytes_copied += stats.bytes_written; + bytes_moved += stats.bytes_moved; + } + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); + ASSERT_EQ(0, bytes_copied); + ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); + } else { + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. 
+ ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +class TestIngestExternalFileListener : public EventListener { + public: + void OnExternalFileIngested(DB* /*db*/, + const ExternalFileIngestionInfo& info) override { + ingested_files.push_back(info); + } + + std::vector ingested_files; +}; + +TEST_P(ExternalSSTFileTest, IngestionListener) { + Options options = CurrentOptions(); + TestIngestExternalFileListener* listener = + new TestIngestExternalFileListener(); + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"koko", "toto"}, options); + + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + // Ingest into default cf + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[0])); + ASSERT_EQ(listener->ingested_files.size(), 1); + ASSERT_EQ(listener->ingested_files.back().cf_name, "default"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "default"); + + // Ingest into cf1 + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[1])); + ASSERT_EQ(listener->ingested_files.size(), 2); + ASSERT_EQ(listener->ingested_files.back().cf_name, "koko"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 1); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "koko"); + + // Ingest into cf2 + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[2])); + ASSERT_EQ(listener->ingested_files.size(), 3); + ASSERT_EQ(listener->ingested_files.back().cf_name, "toto"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 2); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "toto"); +} + +TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const int kNumKeys = 10000; + + // Insert keys using normal path and take a snapshot + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), Key(i) + "_V1")); + } + const Snapshot* snap = db_->GetSnapshot(); + + // Overwrite all keys using IngestExternalFile + std::string sst_file_path = sst_files_dir_ + "file1.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2")); + } + ASSERT_OK(sst_file_writer.Finish()); + + IngestExternalFileOptions ifo; + ifo.move_files = true; + ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo)); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1"); + ASSERT_EQ(Get(Key(i)), Key(i) + "_V2"); + } + + db_->ReleaseSnapshot(snap); +} + +TEST_P(ExternalSSTFileTest, IngestBehind) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 3; + 
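// Ingest-behind places the file in the bottommost level, underneath all
+  // existing data; it is only legal when the DB was opened with
+  // Options::allow_ingest_behind, which reserves the bottom level for such
+  // files, as the assertions below check.
+  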
options.disable_auto_compactions = false;
+  DestroyAndReopen(options);
+  std::vector<std::pair<std::string, std::string>> file_data;
+  std::map<std::string, std::string> true_data;
+
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+
+  // Insert 0 -> 20 using IngestExternalFile
+  file_data.clear();
+  for (int i = 0; i <= 20; i++) {
+    file_data.emplace_back(Key(i), "ingest_behind");
+  }
+
+  bool allow_global_seqno = true;
+  bool ingest_behind = true;
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+  // Can't ingest behind since allow_ingest_behind isn't set to true
+  ASSERT_NOK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+      &true_data));
+
+  options.allow_ingest_behind = true;
+  // Check that we can still open the DB, as num_levels should be
+  // sanitized to 3
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The universal picker should place the output at the second-from-bottom
+  // level
+  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, true /*ingest_behind*/,
+      false /*sort_data*/, &true_data));
+  ASSERT_EQ("0,1,1", FilesPerLevel());
+  // This time the ingest should fail as the file doesn't fit in the bottom
+  // level
+  ASSERT_NOK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, true /*ingest_behind*/,
+      false /*sort_data*/, &true_data));
+  ASSERT_EQ("0,1,1", FilesPerLevel());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The bottom level should be empty
+  ASSERT_EQ("0,1", FilesPerLevel());
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+  Options options = CurrentOptions();
+
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // Create external SST file and include bloom filters
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  {
+    std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    ASSERT_OK(sst_file_writer.Open(file_path));
+    ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    ASSERT_OK(
+        db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+    ASSERT_EQ(Get("Key1"), "Value1");
+    ASSERT_GE(
+        options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD),
+        1);
+  }
+
+  // Create external SST file but skip bloom filters
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  {
+    std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+    SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+                                  Env::IOPriority::IO_TOTAL,
+                                  true /* skip_filters */);
+    
ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK( + db_->IngestExternalFile({file_path}, IngestExternalFileOptions())); + + ASSERT_EQ(Get("Key1"), "Value1"); + ASSERT_EQ( + options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0); + } +} + +TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) { + if (!ZSTD_Supported()) { + return; + } + const int kNumEntries = 1 << 10; + const int kNumBytesPerEntry = 1 << 10; + Options options = CurrentOptions(); + options.compression = kZSTD; + options.compression_opts.max_dict_bytes = 1 << 14; // 16KB + options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB + DestroyAndReopen(options); + + std::atomic num_compression_dicts(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + [&](void* /* arg */) { ++num_compression_dicts; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::vector> random_data; + for (int i = 0; i < kNumEntries; i++) { + std::string val = rnd.RandomString(kNumBytesPerEntry); + random_data.emplace_back(Key(i), std::move(val)); + } + ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); + ASSERT_EQ(1, num_compression_dicts); +} + +class ExternalSSTBlockChecksumTest + : public ExternalSSTFileTestBase, + public testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +// Very slow, not worth the cost to run regularly +TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) { + BlockBasedTableOptions table_options; + table_options.format_version = GetParam(); + for (auto t : GetSupportedChecksums()) { + table_options.checksum = t; + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // 2^32 - 1, will lead to data block with more than 2^32 bytes + size_t huge_size = std::numeric_limits::max(); + + std::string f = sst_files_dir_ + "f.sst"; + ASSERT_OK(sst_file_writer.Open(f)); + { + Random64 r(123); + std::string huge(huge_size, 0); + for (size_t j = 0; j + 7 < huge_size; j += 8) { + EncodeFixed64(&huge[j], r.Next()); + } + ASSERT_OK(sst_file_writer.Put("Huge", huge)); + } + + ExternalSstFileInfo f_info; + ASSERT_OK(sst_file_writer.Finish(&f_info)); + ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10); + + SstFileReader sst_file_reader(options); + ASSERT_OK(sst_file_reader.Open(f)); + ASSERT_OK(sst_file_reader.VerifyChecksum()); + } +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + // Exercise different situations in different column families: two are empty + // (so no new sequence number is needed), but at least one overlaps with the + // DB and needs to bump the sequence number. 
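+  // (The Put below creates that overlap: "foo1" already exists in the
+  // default CF when the ingested file carrying "foo1" arrives.)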
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue")); + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead", + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + const std::vector> data_before_ingestion = + {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}}, + {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}, + {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}}; + for (size_t i = 0; i != handles_.size(); ++i) { + int cf = static_cast(i); + const auto& orig_data = data_before_ingestion[i]; + for (const auto& kv : orig_data) { + ASSERT_OK(Put(cf, kv.first, kv.second)); + } + ASSERT_OK(Flush(cf)); + } + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize 
the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + // Take snapshot before ingestion starts + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.snapshot = dbfull()->GetSnapshot(); + std::vector iters(handles_.size()); + + // Range scan checks first kv of each CF before ingestion starts. + for (size_t i = 0; i != handles_.size(); ++i) { + iters[i] = dbfull()->NewIterator(read_opts, handles_[i]); + iters[i]->SeekToFirst(); + ASSERT_TRUE(iters[i]->Valid()); + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + const std::map& orig_data = + data_before_ingestion[i]; + std::map::const_iterator it = orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + iters[i]->Next(); + } + port::Thread ingest_thread([&]() { + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"); + // Should see only data before ingestion + for (size_t i = 0; i != handles_.size(); ++i) { + const auto& orig_data = data_before_ingestion[i]; + for (; iters[i]->Valid(); iters[i]->Next()) { + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + std::map::const_iterator it = + orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + } + } + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead"); + ingest_thread.join(); + for (auto* iter : iters) { + delete iter; + } + iters.clear(); + dbfull()->ReleaseSnapshot(read_opts.snapshot); + + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + // Should see consistent state after ingestion for all column families even + // without snapshot. 
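+  // In other words, multi-CF ingestion is atomic: after reopening, a reader
+  // sees either all of the ingested files or none of them, never a mix
+  // across column families.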
+ ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingest + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeJobsRun:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeJobsRun:1"}, + }); + 
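// These dependencies bracket the ingestion's commit phase: the main thread
+  // runs between BeforeJobsRun:0 and BeforeJobsRun:1, so the filesystem is
+  // turned off after every ingestion job has prepared but before any of them
+  // commits, and the whole multi-CF ingestion must fail without leaving a
+  // partial state behind.
+  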
SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + SyncPoint::GetInstance()->ClearTrace(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:1", + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize 
the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  port::Thread ingest_thread([&]() {
+    ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos,
+                                           data, -1, true, true_data));
+  });
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+      "PartialManifestWriteFail:0");
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+      "PartialManifestWriteFail:1");
+  ingest_thread.join();
+
+  ASSERT_OK(fault_injection_env->DropUnsyncedFileData());
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      ASSERT_EQ("NOT_FOUND", Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+  Options options = CurrentOptions();
+  // Use large buffer to avoid memtable flush
+  options.write_buffer_size = 1024 * 1024;
+  options.two_write_queues = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key which overlaps with keys in the memtable.
+  // It will trigger flushing the memtable and requires that this thread is
+  // currently at the front of the 2nd writer queue. We must make
+  // sure that it won't enter the 2nd writer queue for the second time.
+  std::vector<std::pair<std::string, std::string>> data;
+  data.push_back(std::make_pair("1001", "v2"));
+  ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true));
+}
+
+TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  constexpr size_t kValueSize = 8;
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueSize);
+
+  // Write some keys to make the global seqno larger than zero
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("ab" + Key(i), value));
+  }
+  // Get a Snapshot to make RocksDB assign a global seqno to ingested sst
+  // files.
+  auto snap = dbfull()->GetSnapshot();
+
+  std::string fname = sst_files_dir_ + "test_file";
+  ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+  ASSERT_OK(writer.Open(fname));
+  std::string key1 = "ab";
+  std::string key2 = "ab";
+
+  // Give key2 the same prefix as key1 plus a zero seqno. The tail of every
+  // key is composed as (seqno << 8 | value_type), and here `1` represents
+  // ValueType::kTypeValue
+
+  PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+  key2 += "cdefghijkl";
+
+  ASSERT_OK(writer.Put(key1, value));
+  ASSERT_OK(writer.Put(key2, value));
+
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer.Finish(&info));
+
+  ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+                                         IngestExternalFileOptions()));
+  dbfull()->ReleaseSnapshot(snap);
+  ASSERT_EQ(value, Get(key1));
+  // Without the fix, this lookup would fail
+  ASSERT_EQ(value, Get(key2));
+}
+
+TEST_P(ExternalSSTFileTest,
+       DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) {
+  // Regression test for a bug where the global seqno corrupted the shared
+  // bytes buffer when switching from reverse iteration to forward iteration.
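+  // Worked example of the key layout used below: with seqno 0 and
+  // kTypeValue (0x1), PackSequenceAndType(0, kTypeValue) == (0 << 8) | 0x1,
+  // and PutFixed64 appends it little-endian, so key2 becomes
+  // "ab" + {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + "cdefghijkl",
+  // sharing only the two-byte prefix "ab" with key1.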
+  constexpr size_t kValueSize = 8;
+  Options options = CurrentOptions();
+
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueSize);
+
+  std::string key0 = "aa";
+  std::string key1 = "ab";
+  // Give key2 the same prefix as key1 plus a zero seqno. The tail of every
+  // key is composed as (seqno << 8 | value_type), and here `1` represents
+  // ValueType::kTypeValue
+  std::string key2 = "ab";
+  PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+  key2 += "cdefghijkl";
+  std::string key3 = key2 + "_";
+
+  // Write some key to make the global seqno larger than zero
+  ASSERT_OK(Put(key0, value));
+
+  std::string fname = sst_files_dir_ + "test_file";
+  ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+  ASSERT_OK(writer.Open(fname));
+
+  // key0 is a dummy to ensure the turnaround point (key1) comes from the Prev
+  // cache rather than the block (restart keys are pinned in the block).
+  ASSERT_OK(writer.Put(key0, value));
+  ASSERT_OK(writer.Put(key1, value));
+  ASSERT_OK(writer.Put(key2, value));
+  ASSERT_OK(writer.Put(key3, value));
+
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer.Finish(&info));
+
+  ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+                                         IngestExternalFileOptions()));
+  ReadOptions read_opts;
+  // Prevents Seek() when switching directions, which would circumvent the bug.
+  read_opts.total_order_seek = true;
+  Iterator* iter = db_->NewIterator(read_opts);
+  // Scan backwards to key2. The file iterator will then be positioned at key1.
+  iter->Seek(key3);
+  ASSERT_EQ(key3, iter->key());
+  iter->Prev();
+  ASSERT_EQ(key2, iter->key());
+  // Scan forwards and make sure key3 is present. Previously key3 would be
+  // corrupted by the global seqno from key1.
+  iter->Next();
+  ASSERT_EQ(key3, iter->key());
+  delete iter;
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+                        testing::Values(std::make_tuple(false, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as External SST File Writer and Ingestion are not "
+          "supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..ddd4b47cc
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,637 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
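+//
+// A typical cycle in these tests, sketched from the helpers defined below:
+//
+//   Build(write_options, 0, num_pre_sync);  // writes that will be persisted
+//   db_->CompactRange(...);                 // or FlushWAL(); forces a sync
+//   Build(write_options, num_pre_sync, n);  // writes left unsynced
+//   env_->SetFilesystemActive(false);       // freeze the filesystem state
+//   ResetDBState(kResetDropUnsyncedData);   // simulate the crash
+//   OpenDB();                               // recover, then Verify()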
+ +#include "db/db_impl/db_impl.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif + +namespace ROCKSDB_NAMESPACE { + +static const int kValueSize = 1000; +static const int kMaxNumValues = 2000; +static const size_t kNumIterations = 3; + +enum FaultInjectionOptionConfig { + kDefault, + kDifferentDataDir, + kWalDir, + kSyncWal, + kWalDirSyncWal, + kMultiLevels, + kEnd, +}; +class FaultInjectionTest + : public testing::Test, + public testing::WithParamInterface> { + protected: + int option_config_; + int non_inclusive_end_range_; // kEnd or equivalent to that + // When need to make sure data is persistent, sync WAL + bool sync_use_wal_; + // When need to make sure data is persistent, call DB::CompactRange() + bool sync_use_compact_; + + bool sequential_order_; + + public: + enum ExpectedVerifResult { kValExpectFound, kValExpectNoError }; + enum ResetMethod { + kResetDropUnsyncedData, + kResetDropRandomUnsyncedData, + kResetDeleteUnsyncedFiles, + kResetDropAndDeleteUnsynced + }; + + std::unique_ptr base_env_; + FaultInjectionTestEnv* env_; + std::string dbname_; + std::shared_ptr tiny_cache_; + Options options_; + DB* db_; + + FaultInjectionTest() + : option_config_(std::get<1>(GetParam())), + non_inclusive_end_range_(std::get<2>(GetParam())), + sync_use_wal_(false), + sync_use_compact_(true), + base_env_(nullptr), + env_(nullptr), + db_(nullptr) { + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_)); + EXPECT_NE(system_env_, nullptr); + } + + ~FaultInjectionTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } + + bool ChangeOptions() { + option_config_++; + if (option_config_ >= non_inclusive_end_range_) { + return false; + } else { + if (option_config_ == kMultiLevels) { + base_env_.reset(MockEnv::Create(system_env_)); + } + return true; + } + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + sync_use_wal_ = false; + sync_use_compact_ = true; + Options options; + switch (option_config_) { + case kWalDir: + options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal"); + break; + case kDifferentDataDir: + options.db_paths.emplace_back( + test::PerThreadDBPath(env_, "fault_test_data"), 1000000U); + break; + case kSyncWal: + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kWalDirSyncWal: + options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal"); + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kMultiLevels: + options.write_buffer_size = 64 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 128 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + default: + break; + } + return options; + } + + Status NewDB() { + assert(db_ == nullptr); + assert(tiny_cache_ == nullptr); + assert(env_ == nullptr); + + env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_); + + options_ = CurrentOptions(); + options_.env = env_; + options_.paranoid_checks = true; + + BlockBasedTableOptions table_options; + tiny_cache_ = NewLRUCache(100); + table_options.block_cache = tiny_cache_; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + dbname_ = test::PerThreadDBPath("fault_test"); + + EXPECT_OK(DestroyDB(dbname_, options_)); + + options_.create_if_missing = true; + Status s = OpenDB(); + options_.create_if_missing = false; + return s; + } + + void SetUp() override { + sequential_order_ = std::get<0>(GetParam()); + ASSERT_OK(NewDB()); + } + + void TearDown() override { + CloseDB(); + + Status s = DestroyDB(dbname_, options_); + + delete env_; + env_ = nullptr; + + tiny_cache_.reset(); + + ASSERT_OK(s); + } + + void Build(const WriteOptions& write_options, int start_idx, int num_vals) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = start_idx; i < start_idx + num_vals; i++) { + Slice key = Key(i, &key_space); + batch.Clear(); + ASSERT_OK(batch.Put(key, Value(i, &value_space))); + ASSERT_OK(db_->Write(write_options, &batch)); + } + } + + Status ReadValue(int i, std::string* val) const { + std::string key_space, value_space; + Slice key = Key(i, &key_space); + Value(i, &value_space); + ReadOptions options; + return db_->Get(options, key, val); + } + + Status Verify(int start_idx, int num_vals, + ExpectedVerifResult expected) const { + std::string val; + std::string value_space; + Status s; + for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { + Value(i, &value_space); + s = ReadValue(i, &val); + if (s.ok()) { + EXPECT_EQ(value_space, val); + } + if (expected == kValExpectFound) { + if (!s.ok()) { + fprintf(stderr, "Error when read %dth record (expect found): %s\n", i, + s.ToString().c_str()); + return s; + } + } else if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "Error when read %dth record: %s\n", i, + s.ToString().c_str()); + return s; + } + } + return Status::OK(); + } + + // Return the ith key + Slice Key(int i, std::string* storage) const { + unsigned long long num = i; + if (!sequential_order_) { + // random transfer + const int m = 0x5bd1e995; + num *= m; + num ^= num << 24; + } + char buf[100]; + snprintf(buf, 
sizeof(buf), "%016d", static_cast(num)); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) const { + Random r(k); + *storage = r.RandomString(kValueSize); + return Slice(*storage); + } + + void CloseDB() { + delete db_; + db_ = nullptr; + } + + Status OpenDB() { + CloseDB(); + env_->ResetState(); + Status s = DB::Open(options_, dbname_, &db_); + assert(db_ != nullptr); + return s; + } + + void DeleteAllData() { + Iterator* iter = db_->NewIterator(ReadOptions()); + WriteOptions options; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); + } + ASSERT_OK(iter->status()); + delete iter; + + FlushOptions flush_options; + flush_options.wait = true; + ASSERT_OK(db_->Flush(flush_options)); + } + + // rnd cannot be null for kResetDropRandomUnsyncedData + void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) { + env_->AssertNoOpenFile(); + switch (reset_method) { + case kResetDropUnsyncedData: + ASSERT_OK(env_->DropUnsyncedFileData()); + break; + case kResetDropRandomUnsyncedData: + ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd)); + break; + case kResetDeleteUnsyncedFiles: + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + case kResetDropAndDeleteUnsynced: + ASSERT_OK(env_->DropUnsyncedFileData()); + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + default: + assert(false); + } + } + + void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { + DeleteAllData(); + + WriteOptions write_options; + write_options.sync = sync_use_wal_; + + Build(write_options, 0, num_pre_sync); + if (sync_use_compact_) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + write_options.sync = false; + Build(write_options, num_pre_sync, num_post_sync); + } + + void PartialCompactTestReopenWithFault(ResetMethod reset_method, + int num_pre_sync, int num_post_sync, + Random* rnd = nullptr) { + env_->SetFilesystemActive(false); + CloseDB(); + ResetDBState(reset_method, rnd); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::kValExpectNoError)); + WaitCompactionFinish(); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::kValExpectNoError)); + } + + void NoWriteTestPreFault() {} + + void NoWriteTestReopenWithFault(ResetMethod reset_method) { + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + } + + void WaitCompactionFinish() { + ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); + ASSERT_OK(db_->Put(WriteOptions(), "", "")); + } + + private: + Env* system_env_; + std::shared_ptr env_guard_; +}; + +class FaultInjectionTestSplitted : public FaultInjectionTest {}; + +TEST_P(FaultInjectionTestSplitted, FaultTest) { + do { + Random rnd(301); + + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + 
PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData, + num_pre_sync, num_post_sync, &rnd); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + // Setting a separate data path won't pass the test as we don't sync + // it after creating new files, + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); + } + } while (ChangeOptions()); +} + +// Previous log file is not fsynced if sync is forced after log rolling. +TEST_P(FaultInjectionTest, WriteOptionSyncTest) { + test::SleepingBackgroundTask sleeping_task_low; + env_->SetBackgroundThreads(1, Env::HIGH); + // Block the job queue to prevent flush job from running. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); + + WriteOptions write_options; + write_options.sync = false; + + std::string key_space, value_space; + ASSERT_OK( + db_->Put(write_options, Key(1, &key_space), Value(1, &value_space))); + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(db_->Flush(flush_options)); + write_options.sync = true; + ASSERT_OK( + db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); + ASSERT_OK(db_->FlushWAL(false)); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + ASSERT_OK(OpenDB()); + std::string val; + Value(2, &value_space); + ASSERT_OK(ReadValue(2, &val)); + ASSERT_EQ(value_space, val); + + Value(1, &value_space); + ASSERT_OK(ReadValue(1, &val)); + ASSERT_EQ(value_space, val); +} + +TEST_P(FaultInjectionTest, UninstalledCompaction) { + options_.target_file_size_base = 32 * 1024; + options_.write_buffer_size = 100 << 10; // 100KB + options_.level0_file_num_compaction_trigger = 6; + options_.level0_stop_writes_trigger = 1 << 10; + options_.level0_slowdown_writes_trigger = 1 << 10; + options_.max_background_compactions = 1; + OpenDB(); + + if (!sequential_order_) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"}, + {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"}, + {"FaultInjectionTest::FaultTest:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int kNumKeys = 1000; + Build(WriteOptions(), 0, kNumKeys); + FlushOptions flush_options; + flush_options.wait = true; + ASSERT_OK(db_->Flush(flush_options)); + ASSERT_OK(db_->Put(WriteOptions(), "", "")); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); + env_->SetFilesystemActive(false); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2"); + CloseDB(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ResetDBState(kResetDropUnsyncedData); + + std::atomic opened(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Open:Opened", 
std::atomic&lt;bool&gt; opened(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Open:Opened", + [&](void* /*arg*/) { opened.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", + [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound)); + WaitCompactionFinish(); + ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(FaultInjectionTest, ManualLogSyncTest) { + test::SleepingBackgroundTask sleeping_task_low; + env_->SetBackgroundThreads(1, Env::HIGH); + // Block the job queue to prevent flush job from running. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); + + WriteOptions write_options; + write_options.sync = false; + + std::string key_space, value_space; + ASSERT_OK( + db_->Put(write_options, Key(1, &key_space), Value(1, &value_space))); + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(db_->Flush(flush_options)); + ASSERT_OK( + db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); + ASSERT_OK(db_->FlushWAL(true)); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + ASSERT_OK(OpenDB()); + std::string val; + Value(2, &value_space); + ASSERT_OK(ReadValue(2, &val)); + ASSERT_EQ(value_space, val); + + Value(1, &value_space); + ASSERT_OK(ReadValue(1, &val)); + ASSERT_EQ(value_space, val); +} + +TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { + ReadOptions ro; + Options options = CurrentOptions(); + options.env = env_; + + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch batch; + ASSERT_OK(batch.Put("cats", "dogs")); + batch.MarkWalTerminationPoint(); + ASSERT_OK(batch.Put("boys", "girls")); + ASSERT_OK(db_->Write(wo, &batch)); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + ASSERT_OK(OpenDB()); + + std::string val; + ASSERT_OK(db_->Get(ro, "cats", &val)); + ASSERT_EQ("dogs", val); + ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); +} + +TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { + auto fault_fs = std::make_shared&lt;FaultInjectionTestFS&gt;(FileSystem::Default()); + fault_fs->EnableWriteErrorInjection(); + fault_fs->SetFilesystemDirectWritable(false); + const std::string file_name = NormalizePath(dbname_ + "/test_file"); + std::unique_ptr&lt;log::Writer&gt; log_writer = nullptr; + constexpr uint64_t log_number = 0; + { + std::unique_ptr&lt;FSWritableFile&gt; file; + const Status s = + fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr&lt;WritableFileWriter&gt; fwriter( + new WritableFileWriter(std::move(file), file_name, FileOptions())); + log_writer.reset(new log::Writer(std::move(fwriter), log_number, + /*recycle_log_files=*/false)); + } + + fault_fs->SetRandomWriteError( + 0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"), + /*inject_for_all_file_types=*/true, /*types=*/{}); + + { + VersionEdit edit; + edit.SetColumnFamily(0); + std::string buf; + assert(edit.EncodeTo(&buf)); + const Status s = log_writer->AddRecord(buf); + ASSERT_NOK(s); + } + + fault_fs->DisableWriteErrorInjection(); +
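// What the remainder of the test verifies: the failed AddRecord() above + // must not leave a duplicate trailing copy of the record behind when the + // writer flushes its buffer on close, so the read-back below expects to + // decode exactly one version edit. +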
// Closing the log writer will cause WritableFileWriter::Close() and flush + // remaining data from its buffer to underlying file. + log_writer.reset(); + + { + std::unique_ptr&lt;FSSequentialFile&gt; file; + Status s = + fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr&lt;SequentialFileReader&gt; freader( + new SequentialFileReader(std::move(file), file_name)); + Status log_read_s; + class LogReporter : public log::Reader::Reporter { + public: + Status* status_; + explicit LogReporter(Status* _s) : status_(_s) {} + void Corruption(size_t /*bytes*/, const Status& _s) override { + if (status_->ok()) { + *status_ = _s; + } + } + } reporter(&log_read_s); + std::unique_ptr&lt;log::Reader&gt; log_reader(new log::Reader( + nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number)); + Slice record; + std::string data; + size_t count = 0; + while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) { + VersionEdit edit; + ASSERT_OK(edit.DecodeFrom(data)); + ++count; + } + // Verify that only one version edit exists in the file. + ASSERT_EQ(1, count); + } +} + +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTest, + ::testing::Values(std::make_tuple(false, kDefault, kEnd), + std::make_tuple(true, kDefault, kEnd))); + +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTestSplitted, + ::testing::Values(std::make_tuple(false, kDefault, kSyncWal), + std::make_tuple(true, kDefault, kSyncWal), + std::make_tuple(false, kSyncWal, kEnd), + std::make_tuple(true, kSyncWal, kEnd))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc new file mode 100644 index 000000000..608f1cb28 --- /dev/null +++ b/src/rocksdb/db/file_indexer.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/file_indexer.h" + +#include &lt;algorithm&gt; +#include &lt;functional&gt; + +#include "db/version_edit.h" +#include "rocksdb/comparator.h" + +namespace ROCKSDB_NAMESPACE { + +FileIndexer::FileIndexer(const Comparator* ucmp) + : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} + +size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } + +size_t FileIndexer::LevelIndexSize(size_t level) const { + if (level >= next_level_index_.size()) { + return 0; + } + return next_level_index_[level].num_index; +} + +void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, + int32_t* right_bound) const { + assert(level > 0); + + // Last level, no hint + if (level == num_levels_ - 1) { + *left_bound = 0; + *right_bound = -1; + return; + } + + assert(level < num_levels_ - 1); + assert(static_cast&lt;int32_t&gt;(file_index) <= level_rb_[level]); + + const IndexUnit* index_units = next_level_index_[level].index_units; + const auto& index = index_units[file_index]; +
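// Case map for the branches below (restating the IndexUnit documentation + // in file_indexer.h): cmp_smallest / cmp_largest are the comparisons of + // the search key against this file's smallest / largest key. + // key < smallest -> [prev file's largest_lb (or 0), smallest_rb] + // key == smallest -> [smallest_lb, smallest_rb] + // smallest < key < largest -> [smallest_lb, largest_rb] + // key == largest -> [largest_lb, largest_rb] + // key > largest -> [largest_lb, level_rb_[level + 1]] +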
if (cmp_smallest < 0) { + *left_bound = (level > 0 && file_index > 0) + ? index_units[file_index - 1].largest_lb + : 0; + *right_bound = index.smallest_rb; + } else if (cmp_smallest == 0) { + *left_bound = index.smallest_lb; + *right_bound = index.smallest_rb; + } else if (cmp_smallest > 0 && cmp_largest < 0) { + *left_bound = index.smallest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest == 0) { + *left_bound = index.largest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest > 0) { + *left_bound = index.largest_lb; + *right_bound = level_rb_[level + 1]; + } else { + assert(false); + } + + assert(*left_bound >= 0); + assert(*left_bound <= *right_bound + 1); + assert(*right_bound <= level_rb_[level + 1]); +} + +void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels, + std::vector&lt;FileMetaData*&gt;* const files) { + if (files == nullptr) { + return; + } + if (num_levels == 0) { // uint32_t 0 - 1 would underflow and misbehave + num_levels_ = num_levels; + return; + } + assert(level_rb_ == nullptr); // level_rb_ should be initialized here + + num_levels_ = num_levels; + next_level_index_.resize(num_levels); + + char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t)); + level_rb_ = new (mem) int32_t[num_levels_]; + for (size_t i = 0; i < num_levels_; i++) { + level_rb_[i] = -1; + } + + // L1 - Ln-1 + for (size_t level = 1; level < num_levels_ - 1; ++level) { + const auto& upper_files = files[level]; + const int32_t upper_size = static_cast&lt;int32_t&gt;(upper_files.size()); + const auto& lower_files = files[level + 1]; + level_rb_[level] = static_cast&lt;int32_t&gt;(upper_files.size()) - 1; + if (upper_size == 0) { + continue; + } + IndexLevel& index_level = next_level_index_[level]; + index_level.num_index = upper_size; + mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); + index_level.index_units = new (mem) IndexUnit[upper_size]; + + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; }); + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->largest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; }); + CalculateRB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(), + b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; }); + CalculateRB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->largest.user_key(), + b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; }); + } + + level_rb_[num_levels_ - 1] = + static_cast&lt;int32_t&gt;(files[num_levels_ - 1].size()) - 1; +} + +void FileIndexer::CalculateLB( + const std::vector&lt;FileMetaData*&gt;& upper_files, + const std::vector&lt;FileMetaData*&gt;& lower_files, IndexLevel* index_level, + std::function&lt;int(const FileMetaData*, const FileMetaData*)&gt; cmp_op, + std::function&lt;void(IndexUnit*, int32_t)&gt; set_index) { + const int32_t upper_size = static_cast&lt;int32_t&gt;(upper_files.size()); + const int32_t lower_size = static_cast&lt;int32_t&gt;(lower_files.size()); + int32_t upper_idx = 0; + int32_t lower_idx = 0; + + IndexUnit* index = index_level->index_units; + while (upper_idx < upper_size && lower_idx < lower_size) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); +
+ if (cmp == 0) { + set_index(&index[upper_idx], lower_idx); + ++upper_idx; + } else if (cmp > 0) { + // Lower level's file (largest) is smaller, a key won't hit in that + // file. Move to next lower file + ++lower_idx; + } else { + // Lower level's file becomes larger, update the index, and + // move to the next upper file + set_index(&index[upper_idx], lower_idx); + ++upper_idx; + } + } + + while (upper_idx < upper_size) { + // Lower files are exhausted, that means the remaining upper files are + // greater than any lower files. Set the index to be the lower level size. + set_index(&index[upper_idx], lower_size); + ++upper_idx; + } +} + +void FileIndexer::CalculateRB( + const std::vector&lt;FileMetaData*&gt;& upper_files, + const std::vector&lt;FileMetaData*&gt;& lower_files, IndexLevel* index_level, + std::function&lt;int(const FileMetaData*, const FileMetaData*)&gt; cmp_op, + std::function&lt;void(IndexUnit*, int32_t)&gt; set_index) { + const int32_t upper_size = static_cast&lt;int32_t&gt;(upper_files.size()); + const int32_t lower_size = static_cast&lt;int32_t&gt;(lower_files.size()); + int32_t upper_idx = upper_size - 1; + int32_t lower_idx = lower_size - 1; + + IndexUnit* index = index_level->index_units; + while (upper_idx >= 0 && lower_idx >= 0) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); + + if (cmp == 0) { + set_index(&index[upper_idx], lower_idx); + --upper_idx; + } else if (cmp < 0) { + // Lower level's file (smallest) is larger, a key won't hit in that + // file. Move to next lower file. + --lower_idx; + } else { + // Lower level's file becomes smaller, update the index, and move to + // the next upper file + set_index(&index[upper_idx], lower_idx); + --upper_idx; + } + } + while (upper_idx >= 0) { + // Lower files are exhausted, that means the remaining upper files are + // smaller than any lower files. Set it to -1. + set_index(&index[upper_idx], -1); + --upper_idx; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h new file mode 100644 index 000000000..45cb13615 --- /dev/null +++ b/src/rocksdb/db/file_indexer.h @@ -0,0 +1,140 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include &lt;cstdint&gt; +#include &lt;functional&gt; +#include &lt;limits&gt; +#include &lt;vector&gt; + +#include "memory/arena.h" +#include "port/port.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Comparator; +struct FileMetaData; +struct FdWithKeyRange; +struct FileLevel; + +// The file tree structure in Version is prebuilt and the range of each file +// is known. On Version::Get(), it uses binary search to find a potential file +// and then checks if a target key can be found in the file by comparing the +// key to each file's smallest and largest key. The results of these +// comparisons can be reused beyond checking if a key falls into a file's +// range. +// With some pre-calculated knowledge, each key comparison that has been done +// can serve as a hint to narrow down further searches: if a key compares +// smaller than a file's smallest or largest key, that comparison can be used +// to find the right bound of the next binary search. 
Similarly, if a key +// compares larger than a file's smallest or largest key, it can be utilized +// to find the left bound of the next binary search. +// With these hints, we can greatly reduce the range of the binary search, +// especially for bottom levels, given that one file most likely overlaps with +// only N files from the level below (where N is +// max_bytes_for_level_multiplier). So on level L, we will only look at ~N +// files instead of the N^L files of the naive approach. +class FileIndexer { + public: + explicit FileIndexer(const Comparator* ucmp); + + size_t NumLevelIndex() const; + + size_t LevelIndexSize(size_t level) const; + + // Return a file index range in the next level to search for a key based on + // smallest and largest key comparison for the current file specified by + // level and file_index. When *left_bound < *right_bound, both bounds should + // be valid and fit in the vector size. + void GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, const int cmp_largest, + int32_t* left_bound, int32_t* right_bound) const; + + void UpdateIndex(Arena* arena, const size_t num_levels, + std::vector&lt;FileMetaData*&gt;* const files); + + enum { kLevelMaxIndex = std::numeric_limits&lt;int32_t&gt;::max() }; + + private: + size_t num_levels_; + const Comparator* ucmp_; + + struct IndexUnit { + IndexUnit() + : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {} + // During file search, a key is compared against smallest and largest + // from a FileMetaData. It can have 3 possible outcomes: + // (1) key is smaller than smallest, implying it is also smaller than + // largest. Precalculated index based on "smallest < smallest" can + // be used to provide the right bound. + // (2) key is in between smallest and largest. + // Precalculated index based on "smallest > greatest" can be used to + // provide the left bound. + // Precalculated index based on "largest < smallest" can be used to + // provide the right bound. + // (3) key is larger than largest, implying it is also larger than smallest. + // Precalculated index based on "largest > largest" can be used to + // provide the left bound. + // + // As a result, we will need to do: + // Compare smallest (<=) and largest keys from upper level file with + // smallest key from lower level to get a right bound. + // Compare smallest (>=) and largest keys from upper level file with + // largest key from lower level to get a left bound. + // + // Example: + // level 1: [50 - 60] + // level 2: [1 - 40], [45 - 55], [58 - 80] + // A key 35, which compares less than 50: the 3rd file on level 2 can be + // skipped according to rule (1). LB = 0, RB = 1. + // A key 53, which sits between 50 and 60: the 1st file on level 2 can be + // skipped according to rule (2)-a, but the 3rd file cannot be skipped + // because 60 is greater than 58. LB = 1, RB = 2. + // A key 70, which compares larger than 60: the 1st and 2nd file can be + // skipped according to rule (3). LB = 2, RB = 2. 
+ // + // Points to the leftmost file in the lower level that may contain a key, + // which compares greater than smallest of a FileMetaData (upper level) + int32_t smallest_lb; + // Points to the leftmost file in the lower level that may contain a key, + // which compares greater than largest of a FileMetaData (upper level) + int32_t largest_lb; + // Points to the rightmost file in the lower level that may contain a key, + // which compares smaller than smallest of a FileMetaData (upper level) + int32_t smallest_rb; + // Points to the rightmost file in the lower level that may contain a key, + // which compares smaller than largest of a FileMetaData (upper level) + int32_t largest_rb; + }; + + // Data structure to store IndexUnits in a whole level + struct IndexLevel { + size_t num_index; + IndexUnit* index_units; + + IndexLevel() : num_index(0), index_units(nullptr) {} + }; + + void CalculateLB( + const std::vector&lt;FileMetaData*&gt;& upper_files, + const std::vector&lt;FileMetaData*&gt;& lower_files, IndexLevel* index_level, + std::function&lt;int(const FileMetaData*, const FileMetaData*)&gt; cmp_op, + std::function&lt;void(IndexUnit*, int32_t)&gt; set_index); + + void CalculateRB( + const std::vector&lt;FileMetaData*&gt;& upper_files, + const std::vector&lt;FileMetaData*&gt;& lower_files, IndexLevel* index_level, + std::function&lt;int(const FileMetaData*, const FileMetaData*)&gt; cmp_op, + std::function&lt;void(IndexUnit*, int32_t)&gt; set_index); + + autovector&lt;IndexLevel&gt; next_level_index_; + int32_t* level_rb_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc new file mode 100644 index 000000000..5c82189ef --- /dev/null +++ b/src/rocksdb/db/file_indexer_test.cc @@ -0,0 +1,352 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/file_indexer.h" + +#include &lt;string&gt; + +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +class IntComparator : public Comparator { + public: + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() == 8); + assert(b.size() == 8); + int64_t diff = *reinterpret_cast&lt;const int64_t*&gt;(a.data()) - + *reinterpret_cast&lt;const int64_t*&gt;(b.data()); + if (diff < 0) { + return -1; + } else if (diff == 0) { + return 0; + } else { + return 1; + } + } + + const char* Name() const override { return "IntComparator"; } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class FileIndexerTest : public testing::Test { + public: + FileIndexerTest() + : kNumLevels(4), files(new std::vector&lt;FileMetaData*&gt;[kNumLevels]) {} + + ~FileIndexerTest() override { + ClearFiles(); + delete[] files; + } + + void AddFile(int level, int64_t smallest, int64_t largest) { + auto* f = new FileMetaData(); + f->smallest = IntKey(smallest); + f->largest = IntKey(largest); + files[level].push_back(f); + } + + InternalKey IntKey(int64_t v) { + return InternalKey(Slice(reinterpret_cast&lt;char*&gt;(&v), 8), 0, kTypeValue); + } + + void ClearFiles() { + for (uint32_t i = 0; i < kNumLevels; ++i) { + for (auto* f : files[i]) { + delete f; + } + files[i].clear(); + } + } + + void GetNextLevelIndex(const uint32_t level, const uint32_t file_index, + const int cmp_smallest, const int cmp_largest, + int32_t* left_index, int32_t* right_index) { + *left_index = 100; + *right_index = 100; + indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest, + left_index, right_index); + } + + int32_t left = 100; + int32_t right = 100; + const uint32_t kNumLevels; + IntComparator ucmp; + FileIndexer* indexer; + + std::vector&lt;FileMetaData*&gt;* files; +}; + +// Case 0: Empty +TEST_F(FileIndexerTest, Empty) { + Arena arena; + indexer = new FileIndexer(&ucmp); + indexer->UpdateIndex(&arena, 0, files); + delete indexer; +} + +// Case 1: no overlap, files are on the left of next level files +TEST_F(FileIndexerTest, no_overlap_left) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 100, 200); + AddFile(1, 300, 400); + AddFile(1, 500, 600); + // level 2 + AddFile(2, 1500, 1600); + AddFile(2, 1601, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 2500, 2600); + AddFile(3, 2601, 2699); + AddFile(3, 2700, 2800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(2, right); + } + } + delete indexer; + ClearFiles(); +} + +// Case 2: no overlap, files are on the right of next level files +TEST_F(FileIndexerTest, no_overlap_right) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 2 + AddFile(2, 1500, 1600); + 
AddFile(2, 1601, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 601, 699); + AddFile(3, 700, 800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(f == 0 ? 0 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + } + delete indexer; +} + +// Case 3: empty L2 +TEST_F(FileIndexerTest, empty_L2) { + Arena arena; + indexer = new FileIndexer(&ucmp); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0U, indexer->LevelIndexSize(i)); + } + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 601, 699); + AddFile(3, 700, 800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(1, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + } + delete indexer; + ClearFiles(); +} + +// Case 4: mixed +TEST_F(FileIndexerTest, mixed) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 100, 200); + AddFile(1, 250, 400); + AddFile(1, 450, 500); + // level 2 + AddFile(2, 100, 150); // 0 + AddFile(2, 200, 250); // 1 + AddFile(2, 251, 300); // 2 + AddFile(2, 301, 350); // 3 + AddFile(2, 500, 600); // 4 + // level 3 + AddFile(3, 0, 50); + AddFile(3, 100, 200); + AddFile(3, 201, 250); + indexer->UpdateIndex(&arena, kNumLevels, files); + // level 1, 0 + GetNextLevelIndex(1, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(4, right); + // level 1, 1 + GetNextLevelIndex(1, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 1, 2 + GetNextLevelIndex(1, 2, -1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + 
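// (Why (4, 3) just above: level-1 file 2 is [450, 500]; among the level-2 + // files [100-150] [200-250] [251-300] [301-350] [500-600], a key smaller + // than 450 can be in files up to index 3 at most, while the left bound + // inherited from the previous file's largest_lb is 4, so left > right + // denotes an empty search range.) + 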
GetNextLevelIndex(1, 2, 0, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 2, 1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 2, 0 + GetNextLevelIndex(2, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + // level 2, 1 + GetNextLevelIndex(2, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 0, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 1, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + // level 2, [2 - 4], no overlap + for (uint32_t f = 2; f <= 4; ++f) { + GetNextLevelIndex(2, f, -1, -1, &left, &right); + ASSERT_EQ(f == 2 ? 2 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + delete indexer; + ClearFiles(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc new file mode 100644 index 000000000..04c81b333 --- /dev/null +++ b/src/rocksdb/db/filename_test.cc @@ -0,0 +1,241 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "file/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class FileNameTest : public testing::Test {}; + +TEST_F(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + char kDefaultInfoLogDir = 1; + char kDifferentInfoLogDir = 2; + char kNoCheckLogDir = 4; + char kAllMode = kDefaultInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + char mode; + } cases[] = { + {"100.log", 100, kWalFile, kAllMode}, + {"0.log", 0, kWalFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefaultInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefaultInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefaultInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode}, + }; + for (char mode : {kDifferentInfoLogDir, kDefaultInfoLogDir, kNoCheckLogDir}) { + for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + InfoLogPrefix info_log_prefix(mode != kDefaultInfoLogDir, "/rocksdb/dir"); + if (cases[i].mode & mode) { + std::string f = cases[i].fname; + if (mode == kNoCheckLogDir) { + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + } else { + ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type)) + << f; + } + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + } + } + + // Errors + static const char* errors[] = {"", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop"}; + for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + } +} + +TEST_F(FileNameTest, InfoLogFileName) { + std::string dbname = ("/data/rocksdb"); + std::string db_absolute_path; + ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path)); + + ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, "")); + ASSERT_EQ("/data/rocksdb/LOG.old.666", + OldInfoLogFileName(dbname, 666u, db_absolute_path, "")); + + ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG", + InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log")); + ASSERT_EQ( + "/data/rocksdb_log/data_rocksdb_LOG.old.666", + OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log")); +} + +TEST_F(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + 
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192U, number); + ASSERT_EQ(kWalFile, type); + + fname = TableFileName({DbPath("bar", 0)}, 200, 0); + std::string fname1 = + TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1); + ASSERT_EQ(fname, fname1); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200U, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999U, number); + ASSERT_EQ(kTempFile, type); + + fname = MetaDatabaseName("met", 100); + ASSERT_EQ("met/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kMetaDatabase, type); +} + +TEST_F(FileNameTest, NormalizePath) { + // No leading slash + const std::string sep = std::string(1, kFilePathSeparator); + + std::string expected = "FOLDER" + sep + "filename.ext"; + std::string given = "FOLDER" + sep + "filename.ext"; + + ASSERT_EQ(expected, NormalizePath(given)); + + // Two chars /a + + expected = sep + "a"; + given = expected; + ASSERT_EQ(expected, NormalizePath(given)); + + // Two chars a/ + expected = "a" + sep; + given = expected; + ASSERT_EQ(expected, NormalizePath(given)); + + // Server only + expected = sep + sep + "a"; + given = expected; + ASSERT_EQ(expected, NormalizePath(given)); + + // Two slashes after character + expected = "a" + sep; + given = "a" + sep + sep; + + ASSERT_EQ(expected, NormalizePath(given)); + + // slash only / + expected = sep; + given = expected; + ASSERT_EQ(expected, NormalizePath(given)); + + // UNC only // + expected = sep; + given = sep + sep; + + ASSERT_EQ(expected, NormalizePath(given)); + + // 3 slashes // + expected = sep + sep; + given = sep + sep + sep; + ASSERT_EQ(expected, NormalizePath(given)); + + // 3 slashes // + expected = sep + sep + "a" + sep; + given = sep + sep + sep + "a" + sep; + ASSERT_EQ(expected, NormalizePath(given)); + + // 2 separators in the middle + expected = "a" + sep + "b"; + given = "a" + sep + sep + "b"; + ASSERT_EQ(expected, NormalizePath(given)); + + // UNC with duplicate slashes + expected = sep + sep + "SERVER" + sep + "a" + sep + "b" + sep + "c"; + given = sep + sep + "SERVER" + sep + "a" + sep + sep + "b" + sep + "c"; + ASSERT_EQ(expected, NormalizePath(given)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc new file mode 100644 index 000000000..645e42f44 --- /dev/null +++ b/src/rocksdb/db/flush_job.cc @@ -0,0 +1,1094 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/flush_job.h" + +#include &lt;algorithm&gt; +#include &lt;cinttypes&gt; +#include &lt;vector&gt; + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/event_helpers.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetFlushReasonString(FlushReason flush_reason) { + switch (flush_reason) { + case FlushReason::kOthers: + return "Other Reasons"; + case FlushReason::kGetLiveFiles: + return "Get Live Files"; + case FlushReason::kShutDown: + return "Shut down"; + case FlushReason::kExternalFileIngestion: + return "External File Ingestion"; + case FlushReason::kManualCompaction: + return "Manual Compaction"; + case FlushReason::kWriteBufferManager: + return "Write Buffer Manager"; + case FlushReason::kWriteBufferFull: + return "Write Buffer Full"; + case FlushReason::kTest: + return "Test"; + case FlushReason::kDeleteFiles: + return "Delete Files"; + case FlushReason::kAutoCompaction: + return "Auto Compaction"; + case FlushReason::kManualFlush: + return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; + default: + return "Invalid"; + } +} + +FlushJob::FlushJob( + const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic&lt;bool&gt;* shutting_down, + std::vector&lt;SequenceNumber&gt; existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, CompressionType output_compression, + Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr&lt;IOTracer&gt;& io_tracer, + const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id, + const std::string& db_session_id, std::string full_history_ts_low, + BlobFileCompletionCallback* blob_callback) + : dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), + cfd_(cfd), + db_options_(db_options), + 
mutable_cf_options_(mutable_cf_options), + max_memtable_id_(max_memtable_id), + file_options_(file_options), + versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + job_context_(job_context), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_file_directory_(output_file_directory), + output_compression_(output_compression), + stats_(stats), + event_logger_(event_logger), + measure_io_stats_(measure_io_stats), + sync_output_directory_(sync_output_directory), + write_manifest_(write_manifest), + edit_(nullptr), + base_(nullptr), + pick_memtable_called(false), + thread_pri_(thread_pri), + io_tracer_(io_tracer), + clock_(db_options_.clock), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback), + db_impl_seqno_time_mapping_(seqno_time_mapping) { + // Update the thread status to indicate flush. + ReportStartedFlush(); + TEST_SYNC_POINT("FlushJob::FlushJob()"); +} + +FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); } + +void FlushJob::ReportStartedFlush() { + ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, + db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_context_->job_id); + IOSTATS_RESET(bytes_written); +} + +void FlushJob::ReportFlushInputSize(const autovector&lt;MemTable*&gt;& mems) { + uint64_t input_size = 0; + for (auto* mem : mems) { + input_size += mem->ApproximateMemoryUsage(); + } + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size); +} + +void FlushJob::RecordFlushIOStats() { + RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} +void FlushJob::PickMemTable() { + db_mutex_->AssertHeld(); + assert(!pick_memtable_called); + pick_memtable_called = true; + + // Maximum "NextLogNumber" of the memtables to flush. + // When mempurge feature is turned off, this variable is useless + // because the memtables are implicitly sorted by increasing order of creation + // time. Therefore mems_.back()->GetNextLogNumber() is already equal to + // max_next_log_number. However when Mempurge is on, the memtables are no + // longer sorted by increasing order of creation time. Therefore this variable + // becomes necessary because mems_.back()->GetNextLogNumber() is no longer + // necessarily equal to max_next_log_number. + uint64_t max_next_log_number = 0; + + // Save the contents of the earliest memtable as a new Table + cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_, + &max_next_log_number); + if (mems_.empty()) { + return; + } + + ReportFlushInputSize(mems_); + + // entries mems are (implicitly) sorted in ascending order by their creation + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems_[0]; + edit_ = m->GetEdits(); + edit_->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit_->SetLogNumber(max_next_log_number); + edit_->SetColumnFamily(cfd_->GetID()); + + // path 0 for level 0 file. 
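// (FileDescriptor's arguments below are (file_number, path_id, file_size): + // a flush output always goes to path 0, and the size is filled in later, + // once the table file has been built.) 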
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + + base_ = cfd_->current(); + base_->Ref(); // it is likely that we do not need this reference +} + +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, + bool* switched_to_mempurge) { + TEST_SYNC_POINT("FlushJob::Start"); + db_mutex_->AssertHeld(); + assert(pick_memtable_called); + // Mempurge threshold can be dynamically changed. + // For sake of consistency, mempurge_threshold is + // saved locally to maintain consistency in each + // FlushJob::Run call. + double mempurge_threshold = + mutable_cf_options_.experimental_mempurge_threshold; + + AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN); + if (mems_.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTime); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + Status mempurge_s = Status::NotFound("No MemPurge."); + if ((mempurge_threshold > 0.0) && + (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) && + (!mems_.empty()) && MemPurgeDecider(mempurge_threshold) && + !(db_options_.atomic_flush)) { + cfd_->SetMempurgeUsed(); + mempurge_s = MemPurge(); + if (!mempurge_s.ok()) { + // Mempurge is typically aborted when the output + // bytes cannot be contained onto a single output memtable. + if (mempurge_s.IsAborted()) { + ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n", + mempurge_s.ToString().c_str()); + } else { + // However the mempurge process can also fail for + // other reasons (eg: new_mem->Add() fails). + ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n", + mempurge_s.ToString().c_str()); + } + } else { + if (switched_to_mempurge) { + *switched_to_mempurge = true; + } else { + // The mempurge process was successful, but no switch_to_mempurge + // pointer provided so no way to propagate the state of flush job. + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge process succeeded" + "but no 'switched_to_mempurge' ptr provided.\n"); + } + } + } + Status s; + if (mempurge_s.ok()) { + base_->Unref(); + s = Status::OK(); + } else { + // This will release and re-acquire the mutex. 
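// (WriteLevel0Table() drops db_mutex_ around the actual table build and + // file sync so foreground work is not blocked, and re-acquires it before + // the results are installed below.) 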
+ s = WriteLevel0Table(); + } + + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); + } + + if (!s.ok()) { + cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); + } else if (write_manifest_) { + TEST_SYNC_POINT("FlushJob::InstallResults"); + // Replace immutable memtable with the generated Table + s = cfd_->imm()->TryInstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, + meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, + log_buffer_, &committed_flush_jobs_info_, + !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), + but 'false' if mempurge successful: no new min log number + or new level 0 file path to write to manifest. */); + } + + if (s.ok() && file_meta != nullptr) { + *file_meta = meta_; + } + RecordFlushIOStats(); + + // When measure_io_stats_ is true, the default 512 bytes is not enough. + auto stream = event_logger_->LogToBuffer(log_buffer_, 1024); + stream << "job" << job_context_->job_id << "event" + << "flush_finished"; + stream << "output_compression" + << CompressionTypeToString(output_compression_); + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + assert(blob_files.front()); + stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber(); + + assert(blob_files.back()); + stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber(); + } + + stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed(); + + if (measure_io_stats_) { + if (prev_perf_level != PerfLevel::kEnableTime) { + SetPerfLevel(prev_perf_level); + } + stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos); + stream << "file_range_sync_nanos" + << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos); + stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos); + stream << "file_prepare_write_nanos" + << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos); + stream << "file_cpu_write_nanos" + << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos); + stream << "file_cpu_read_nanos" + << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos); + } + + return s; +} + +void FlushJob::Cancel() { + db_mutex_->AssertHeld(); + assert(base_ != nullptr); + base_->Unref(); +} + +Status FlushJob::MemPurge() { + Status s; + db_mutex_->AssertHeld(); + db_mutex_->Unlock(); + assert(!mems_.empty()); + + // Measure purging time. + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + + MemTable* new_mem = nullptr; + // For performance/log investigation purposes: + // look at how much useful payload we harvest in the new_mem. + // This value is then printed to the DB log. + double new_mem_capacity = 0.0; + + // Create two iterators, one for the memtable data (contains + // info from puts + deletes), and one for the memtable + // Range Tombstones (from DeleteRanges). 
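// (One pair per memtable being purged: the point-data iterators are merged + // below via NewMergingIterator(), while the fragmented range-tombstone + // iterators feed the CompactionRangeDelAggregator.) 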
ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + std::vector&lt;InternalIterator*&gt; memtables; + std::vector&lt;std::unique_ptr&lt;FragmentedRangeTombstoneIterator&gt;&gt; + range_del_iters; + for (MemTable* m : mems_) { + memtables.push_back(m->NewIterator(ro, &arena)); + auto* range_del_iter = m->NewRangeTombstoneIterator( + ro, kMaxSequenceNumber, true /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + } + + assert(!memtables.empty()); + SequenceNumber first_seqno = kMaxSequenceNumber; + SequenceNumber earliest_seqno = kMaxSequenceNumber; + // Pick first and earliest seqno as min of all first_seqno + // and earliest_seqno of the mempurged memtables. + for (const auto& mem : mems_) { + first_seqno = mem->GetFirstSequenceNumber() < first_seqno + ? mem->GetFirstSequenceNumber() + : first_seqno; + earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno + ? mem->GetEarliestSequenceNumber() + : earliest_seqno; + } + + ScopedArenaIterator iter( + NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(), + static_cast&lt;int&gt;(memtables.size()), &arena)); + + auto* ioptions = cfd_->ioptions(); + + // Place iterator at the First (meaning most recent) key node. + iter->SeekToFirst(); + + const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow()); + std::unique_ptr&lt;CompactionRangeDelAggregator&gt; range_del_agg( + new CompactionRangeDelAggregator(&(cfd_->internal_comparator()), + existing_snapshots_, + full_history_ts_low)); + for (auto& rd_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(rd_iter)); + } + + // If there is valid data in the memtable, + // or at least range tombstones, copy over the info + // to the new memtable. + if (iter->Valid() || !range_del_agg->IsEmpty()) { + // MaxSize is the size of a memtable. + size_t maxSize = mutable_cf_options_.write_buffer_size; + std::unique_ptr&lt;CompactionFilter&gt; compaction_filter; + if (ioptions->compaction_filter_factory != nullptr && + ioptions->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kFlush)) { + CompactionFilter::Context ctx; + ctx.is_full_compaction = false; + ctx.is_manual_compaction = false; + ctx.column_family_id = cfd_->GetID(); + ctx.reason = TableFileCreationReason::kFlush; + compaction_filter = + ioptions->compaction_filter_factory->CreateCompactionFilter(ctx); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return s; + } + } + + new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()), + mutable_cf_options_, cfd_->write_buffer_mgr(), + earliest_seqno, cfd_->GetID()); + assert(new_mem != nullptr); + + Env* env = db_options_.env; + assert(env); + MergeHelper merge( + env, (cfd_->internal_comparator()).user_comparator(), + (ioptions->merge_operator).get(), compaction_filter.get(), + ioptions->logger, true /* internal key corruption is not ok */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_); + 
assert(job_context_); + SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); + const std::atomic&lt;bool&gt; kManualCompactionCanceledFalse{false}; + CompactionIterator c_iter( + iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge, + kMaxSequenceNumber, &existing_snapshots_, + earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_, + env, ShouldReportDetailedTime(env, ioptions->stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + nullptr, ioptions->allow_data_in_errors, + ioptions->enforce_single_del_contracts, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low); + + // Set earliest sequence number in the new memtable + // to be equal to the earliest sequence number of the + // memtable being flushed (See later if there is a need + // to update this number!). + new_mem->SetEarliestSequenceNumber(earliest_seqno); + // Likewise for first seq number. + new_mem->SetFirstSequenceNumber(first_seqno); + SequenceNumber new_first_seqno = kMaxSequenceNumber; + + c_iter.SeekToFirst(); + + // Key transfer + for (; c_iter.Valid(); c_iter.Next()) { + const ParsedInternalKey ikey = c_iter.ikey(); + const Slice value = c_iter.value(); + new_first_seqno = + ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno; + + // Should we update "OldestKeyTime"? Timestamps appear + // to still be an "experimental" feature. + s = new_mem->Add( + ikey.sequence, ikey.type, ikey.user_key, value, + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted("Mempurge filled more than one memtable."); + new_mem_capacity = 1.0; + break; + } + } + + // Check status and propagate + // potential error status from c_iter + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + // Range tombstone transfer. + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + new_first_seqno = + tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno; + s = new_mem->Add( + tombstone.seq_, // Sequence number + kTypeRangeDeletion, // KV type + tombstone.start_key_, // Key is start key. + tombstone.end_key_, // Value is end key. + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. 
+ + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + break; + } + } + } + + // If everything happened smoothly and new_mem contains valid data, + // decide if it is flushed to storage or kept in the imm() + // memtable list (memory). + if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) { + // Rectify the first sequence number, which (unlike the earliest seq + // number) needs to be present in the new memtable. + new_mem->SetFirstSequenceNumber(new_first_seqno); + + // The new_mem is added to the list of immutable memtables + // only if it filled at less than 100% capacity and isn't flagged + // as in need of being flushed. + if (new_mem->ApproximateMemoryUsage() < maxSize && + !(new_mem->ShouldFlushNow())) { + // Construct fragmented memtable range tombstones without mutex + new_mem->ConstructFragmentedRangeTombstones(); + db_mutex_->Lock(); + uint64_t new_mem_id = mems_[0]->GetID(); + + new_mem->SetID(new_mem_id); + new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber()); + + // This addition will not trigger another flush, because + // we do not call SchedulePendingFlush(). + cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); + new_mem->Ref(); +#ifndef ROCKSDB_LITE + // Piggyback FlushJobInfo on the first flushed memtable. + db_mutex_->AssertHeld(); + meta_.fd.file_size = 0; + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); +#endif // !ROCKSDB_LITE + db_mutex_->Unlock(); + } else { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + if (new_mem) { + job_context_->memtables_to_free.push_back(new_mem); + } + } + } else { + // In this case, the newly allocated new_mem is empty. + assert(new_mem != nullptr); + job_context_->memtables_to_free.push_back(new_mem); + } + } + + // Reacquire the mutex for WriteLevel0 function. + db_mutex_->Lock(); + + // If mempurge successful, don't write input tables to level0, + // but write any full output table to level0. + if (s.ok()) { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful"); + } else { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful"); + } + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Mempurge lasted %" PRIu64 + " microseconds, and %" PRIu64 + " cpu " + "microseconds. Status is %s ok. Perc capacity: %f\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros, s.ok() ? "" : "not", new_mem_capacity); + + return s; +} + +bool FlushJob::MemPurgeDecider(double threshold) { + // Never trigger mempurge if threshold is not a strictly positive value. + if (!(threshold > 0.0)) { + return false; + } + if (threshold > (1.0 * mems_.size())) { + return true; + } + // Payload and useful_payload (in bytes). + // The useful payload ratio of a given MemTable + // is estimated to be useful_payload/payload. + uint64_t payload = 0, useful_payload = 0, entry_size = 0; + + // Local variables used repetitively inside the for-loop + // when iterating over the sampled entries. 
+  Slice key_slice, value_slice;
+  ParsedInternalKey res;
+  SnapshotImpl min_snapshot;
+  std::string vget;
+  Status mget_s, parse_s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0, sqno = 0,
+                 min_seqno_snapshot = 0;
+  bool get_res, can_be_useful_payload, not_in_next_mems;
+
+  // If estimated_useful_payload is > threshold,
+  // then flush to storage, else MemPurge.
+  double estimated_useful_payload = 0.0;
+  // Cochran formula for determining sample size.
+  // 95% confidence interval, 7% precision.
+  //    n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
+  double n0 = 196.0;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+
+  // Iterate over each memtable of the set.
+  for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_);
+       mem_iter++) {
+    MemTable* mt = *mem_iter;
+
+    // Sample from the table.
+    uint64_t nentries = mt->num_entries();
+    // Corrected Cochran formula for small populations
+    // (converges to n0 for large populations).
+    uint64_t target_sample_size =
+        static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
+    std::unordered_set<const char*> sentries = {};
+    // Populate sample entries set.
+    mt->UniqueRandomSample(target_sample_size, &sentries);
+
+    // Estimate the garbage ratio by comparing if
+    // each sample corresponds to a valid entry.
+    for (const char* ss : sentries) {
+      key_slice = GetLengthPrefixedSlice(ss);
+      parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/);
+      if (!parse_s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Memtable Decider: ParseInternalKey did not parse "
+                       "key_slice %s successfully.",
+                       key_slice.data());
+      }
+
+      // Size of the entry is "key size (+ value size if KV entry)"
+      entry_size = key_slice.size();
+      if (res.type == kTypeValue) {
+        value_slice =
+            GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+        entry_size += value_slice.size();
+      }
+
+      // Count entry bytes as payload.
+      payload += entry_size;
+
+      LookupKey lkey(res.user_key, kMaxSequenceNumber);
+
+      // Paranoia: zero out these values just in case.
+      max_covering_tombstone_seq = 0;
+      sqno = 0;
+
+      // Pick the oldest existing snapshot that is more recent
+      // than the sequence number of the sampled entry.
+      min_seqno_snapshot = kMaxSequenceNumber;
+      for (SequenceNumber seq_num : existing_snapshots_) {
+        if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
+          min_seqno_snapshot = seq_num;
+        }
+      }
+      min_snapshot.number_ = min_seqno_snapshot;
+      ro.snapshot =
+          min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
+
+      // Estimate if the sample entry is valid or not.
+      get_res = mt->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                        &mget_s, &merge_context, &max_covering_tombstone_seq,
+                        &sqno, ro, true /* immutable_memtable */);
+      if (!get_res) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Memtable Get returned false when Get(sampled entry). "
+            "Yet each sample entry should exist somewhere in the memtable, "
+            "unrelated to whether it has been deleted or not.");
+      }
+
+      // TODO(bjlemaire): evaluate typeMerge.
+      // This is where the sampled entry is estimated to be
+      // garbage or not. Note that this is a garbage *estimation*
+      // because we do not include certain items such as
+      // CompactionFilters triggered at flush, or cases where the same
+      // delete has been inserted twice or more in the memtable.
+
+      // Evaluate if the entry can be useful payload.
+      // Situation #1: entry is a KV entry, was found in the memtable mt
+      // and the sequence numbers match.
+ can_be_useful_payload = (res.type == kTypeValue) && get_res && + mget_s.ok() && (sqno == res.sequence); + + // Situation #2: entry is a delete entry, was found in the memtable mt + // (because gres==true) and no valid KV entry is found. + // (note: duplicate delete entries are also taken into + // account here, because the sequence number 'sqno' + // in memtable->Get(&sqno) operation is set to be equal + // to the most recent delete entry as well). + can_be_useful_payload |= + ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) && + mget_s.IsNotFound() && get_res && (sqno == res.sequence); + + // If there is a chance that the entry is useful payload + // Verify that the entry does not appear in the following memtables + // (memtables with greater memtable ID/larger sequence numbers). + if (can_be_useful_payload) { + not_in_next_mems = true; + for (auto next_mem_iter = mem_iter + 1; + next_mem_iter != std::end(mems_); next_mem_iter++) { + if ((*next_mem_iter) + ->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr, + &mget_s, &merge_context, &max_covering_tombstone_seq, + &sqno, ro, true /* immutable_memtable */)) { + not_in_next_mems = false; + break; + } + } + if (not_in_next_mems) { + useful_payload += entry_size; + } + } + } + if (payload > 0) { + // We use the estimated useful payload ratio to + // evaluate how many of the memtable bytes are useful bytes. + estimated_useful_payload += + (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload); + + ROCKS_LOG_INFO(db_options_.info_log, + "Mempurge sampling [CF %s] - found garbage ratio from " + "sampling: %f. Threshold is %f\n", + cfd_->GetName().c_str(), + (payload - useful_payload) * 1.0 / payload, threshold); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge sampling: null payload measured, and collected " + "sample size is %zu\n.", + sentries.size()); + } + } + // We convert the total number of useful payload bytes + // into the proportion of memtable necessary to store all these bytes. + // We compare this proportion with the threshold value. + return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) < + threshold); +} + +Status FlushJob::WriteLevel0Table() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_FLUSH_WRITE_L0); + db_mutex_->AssertHeld(); + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + Status s; + + SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber(); + if (!db_impl_seqno_time_mapping_.Empty()) { + // make a local copy, as the seqno_time_mapping from db_impl is not thread + // safe, which will be used while not holding the db_mutex. + seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno); + } + + std::vector blob_file_additions; + + { + auto write_hint = cfd_->CalculateSSTWriteHint(0); + Env::IOPriority io_priority = GetRateLimiterPriorityForWrite(); + db_mutex_->Unlock(); + if (log_buffer_) { + log_buffer_->FlushBufferToLog(); + } + // memtables and range_del_iters store internal iterators over each data + // memtable and its associated range deletion memtable, respectively, at + // corresponding indexes. 
+    std::vector<InternalIterator*> memtables;
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Arena arena;
+    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    uint64_t total_data_size = 0;
+    size_t total_memory_usage = 0;
+    // Used for testing:
+    uint64_t mems_size = mems_.size();
+    (void)mems_size;  // avoids unused variable error when
+                      // TEST_SYNC_POINT_CALLBACK not used.
+    TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:num_memtables",
+                             &mems_size);
+    assert(job_context_);
+    for (MemTable* m : mems_) {
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ro, &arena));
+      auto* range_del_iter = m->NewRangeTombstoneIterator(
+          ro, kMaxSequenceNumber, true /* immutable_memtable */);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+      total_num_entries += m->num_entries();
+      total_num_deletes += m->num_deletes();
+      total_data_size += m->get_data_size();
+      total_memory_usage += m->ApproximateMemoryUsage();
+    }
+
+    event_logger_->Log() << "job" << job_context_->job_id << "event"
+                         << "flush_started"
+                         << "num_memtables" << mems_.size() << "num_entries"
+                         << total_num_entries << "num_deletes"
+                         << total_num_deletes << "total_data_size"
+                         << total_data_size << "memory_usage"
+                         << total_memory_usage << "flush_reason"
+                         << GetFlushReasonString(cfd_->GetFlushReason());
+
+    {
+      ScopedArenaIterator iter(
+          NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
+                             static_cast<int>(memtables.size()), &arena));
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+                     cfd_->GetName().c_str(), job_context_->job_id,
+                     meta_.fd.GetNumber());
+
+      TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+                               &output_compression_);
+      int64_t _current_time = 0;
+      auto status = clock_->GetCurrentTime(&_current_time);
+      // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Failed to get current time to populate creation_time property. "
+            "Status: %s",
+            status.ToString().c_str());
+      }
+      const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+      uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime();
+
+      // It's not clear whether oldest_key_time is always available. In case
+      // it is not available, use current_time.
+      uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
+
+      TEST_SYNC_POINT_CALLBACK(
+          "FlushJob::WriteLevel0Table:oldest_ancester_time",
+          &oldest_ancester_time);
+      meta_.oldest_ancester_time = oldest_ancester_time;
+      meta_.file_creation_time = current_time;
+
+      uint64_t num_input_entries = 0;
+      uint64_t memtable_payload_bytes = 0;
+      uint64_t memtable_garbage_bytes = 0;
+      IOStatus io_s;
+
+      const std::string* const full_history_ts_low =
+          (full_history_ts_low_.empty()) ?
nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, oldest_key_time, current_time, + db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); + const SequenceNumber job_snapshot_seq = + job_context_->GetJobSnapshotSequence(); + s = BuildTable( + dbname_, versions_, db_options_, tboptions, file_options_, + cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, + &blob_file_additions, existing_snapshots_, + earliest_write_conflict_snapshot_, job_snapshot_seq, + snapshot_checker_, mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, + job_context_->job_id, io_priority, &table_properties_, write_hint, + full_history_ts_low, blob_callback_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); + // TODO: Cleanup io_status in BuildTable and table builders + assert(!s.ok() || io_s.ok()); + io_s.PermitUncheckedError(); + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + std::to_string(total_num_entries) + + " entries in memtables, but read " + + std::to_string(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + TEST_SYNC_POINT("DBImpl::FlushJob:Flush"); + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } + LogFlush(db_options_.info_log); + } + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 + " bytes %s" + "%s", + cfd_->GetName().c_str(), job_context_->job_id, + meta_.fd.GetNumber(), meta_.fd.GetFileSize(), + s.ToString().c_str(), + meta_.marked_for_compaction ? " (needs compaction)" : ""); + + if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { + s = output_file_directory_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); + db_mutex_->Lock(); + } + base_->Unref(); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + const bool has_output = meta_.fd.GetFileSize() > 0; + + if (s.ok() && has_output) { + TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated"); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. 
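+
+  // [Editor's note] The entry-count check after BuildTable above logs a
+  // warning on mismatch and escalates it to a Corruption error when the
+  // DBOptions::flush_verify_memtable_count option (read above as
+  // db_options_.flush_verify_memtable_count) is set. A hedged usage sketch,
+  // assuming an ordinary Options setup:
+  //
+  //   rocksdb::Options options;
+  //   options.create_if_missing = true;
+  //   options.flush_verify_memtable_count = true;  // fail flush on mismatch
+  //   rocksdb::DB* db = nullptr;
+  //   rocksdb::Status open_s = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+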
+
+    // Add file to L0
+    edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+                   meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+                   meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+                   meta_.marked_for_compaction, meta_.temperature,
+                   meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
+                   meta_.file_creation_time, meta_.file_checksum,
+                   meta_.file_checksum_func_name, meta_.unique_id);
+
+    edit_->SetBlobFileAdditions(std::move(blob_file_additions));
+  }
+#ifndef ROCKSDB_LITE
+  // Piggyback FlushJobInfo on the first flushed memtable.
+  mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif  // !ROCKSDB_LITE
+
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+  const uint64_t micros = clock_->NowMicros() - start_micros;
+  const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+  stats.micros = micros;
+  stats.cpu_micros = cpu_micros;
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Flush lasted %" PRIu64
+                 " microseconds, and %" PRIu64 " cpu microseconds.\n",
+                 cfd_->GetName().c_str(), job_context_->job_id, micros,
+                 cpu_micros);
+
+  if (has_output) {
+    stats.bytes_written = meta_.fd.GetFileSize();
+    stats.num_output_files = 1;
+  }
+
+  const auto& blobs = edit_->GetBlobFileAdditions();
+  for (const auto& blob : blobs) {
+    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+  }
+
+  stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+  RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+  cfd_->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED,
+      stats.bytes_written + stats.bytes_written_blob);
+  RecordFlushIOStats();
+
+  return s;
+}
+
+Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() {
+  if (versions_ && versions_->GetColumnFamilySet() &&
+      versions_->GetColumnFamilySet()->write_controller()) {
+    WriteController* write_controller =
+        versions_->GetColumnFamilySet()->write_controller();
+    if (write_controller->IsStopped() || write_controller->NeedsDelay()) {
+      return Env::IO_USER;
+    }
+  }
+
+  return Env::IO_HIGH;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+  db_mutex_->AssertHeld();
+  std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+  info->cf_id = cfd_->GetID();
+  info->cf_name = cfd_->GetName();
+
+  const uint64_t file_number = meta_.fd.GetNumber();
+  info->file_path =
+      MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+  info->file_number = file_number;
+  info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+  info->thread_id = db_options_.env->GetThreadID();
+  info->job_id = job_context_->job_id;
+  info->smallest_seqno = meta_.fd.smallest_seqno;
+  info->largest_seqno = meta_.fd.largest_seqno;
+  info->table_properties = table_properties_;
+  info->flush_reason = cfd_->GetFlushReason();
+  info->blob_compression_type = mutable_cf_options_.blob_compression_type;
+
+  // Update BlobFilesInfo.
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(cfd_->ioptions()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } + return info; +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h new file mode 100644 index 000000000..60c272aec --- /dev/null +++ b/src/rocksdb/db/flush_job.h @@ -0,0 +1,203 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_completion_callback.h" +#include "db/column_family.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" +#include "db/seqno_to_time_mapping.h" +#include "db/snapshot_impl.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "monitoring/instrumented_mutex.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +class MemTable; +class SnapshotChecker; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class FlushJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive + FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const SeqnoToTimeMapping& seq_time_mapping, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); + + ~FlushJob(); + + // Require db_mutex held. 
+ // Once PickMemTable() is called, either Run() or Cancel() has to be called. + void PickMemTable(); + Status Run(LogsWithPrepTracker* prep_tracker = nullptr, + FileMetaData* file_meta = nullptr, + bool* switched_to_mempurge = nullptr); + void Cancel(); + const autovector& GetMemTables() const { return mems_; } + +#ifndef ROCKSDB_LITE + std::list>* GetCommittedFlushJobsInfo() { + return &committed_flush_jobs_info_; + } +#endif // !ROCKSDB_LITE + + private: + friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test; + + void ReportStartedFlush(); + void ReportFlushInputSize(const autovector& mems); + void RecordFlushIOStats(); + Status WriteLevel0Table(); + + // Memtable Garbage Collection algorithm: a MemPurge takes the list + // of immutable memtables and filters out (or "purge") the outdated bytes + // out of it. The output (the filtered bytes, or "useful payload") is + // then transfered into a new memtable. If this memtable is filled, then + // the mempurge is aborted and rerouted to a regular flush process. Else, + // depending on the heuristics, placed onto the immutable memtable list. + // The addition to the imm list will not trigger a flush operation. The + // flush of the imm list will instead be triggered once the mutable memtable + // is added to the imm list. + // This process is typically intended for workloads with heavy overwrites + // when we want to avoid SSD writes (and reads) as much as possible. + // "MemPurge" is an experimental feature still at a very early stage + // of development. At the moment it is only compatible with the Get, Put, + // Delete operations as well as Iterators and CompactionFilters. + // For this early version, "MemPurge" is called by setting the + // options.experimental_mempurge_threshold value as >0.0. When this is + // the case, ALL automatic flush operations (kWRiteBufferManagerFull) will + // first go through the MemPurge process. Therefore, we strongly + // recommend all users not to set this flag as true given that the MemPurge + // process has not matured yet. + Status MemPurge(); + bool MemPurgeDecider(double threshold); + // The rate limiter priority (io_priority) is determined dynamically here. + Env::IOPriority GetRateLimiterPriorityForWrite(); +#ifndef ROCKSDB_LITE + std::unique_ptr GetFlushJobInfo() const; +#endif // !ROCKSDB_LITE + + const std::string& dbname_; + const std::string db_id_; + const std::string db_session_id_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const MutableCFOptions& mutable_cf_options_; + // A variable storing the largest memtable id to flush in this + // flush job. RocksDB uses this variable to select the memtables to flush in + // this job. All memtables in this column family with an ID smaller than or + // equal to max_memtable_id_ will be selected for flush. + uint64_t max_memtable_id_; + const FileOptions file_options_; + VersionSet* versions_; + InstrumentedMutex* db_mutex_; + std::atomic* shutting_down_; + std::vector existing_snapshots_; + SequenceNumber earliest_write_conflict_snapshot_; + SnapshotChecker* snapshot_checker_; + JobContext* job_context_; + LogBuffer* log_buffer_; + FSDirectory* db_directory_; + FSDirectory* output_file_directory_; + CompressionType output_compression_; + Statistics* stats_; + EventLogger* event_logger_; + TableProperties table_properties_; + bool measure_io_stats_; + // True if this flush job should call fsync on the output directory. False + // otherwise. + // Usually sync_output_directory_ is true. 
A flush job needs to call sync on + // the output directory before committing to the MANIFEST. + // However, an individual flush job does not have to call sync on the output + // directory if it is part of an atomic flush. After all flush jobs in the + // atomic flush succeed, call sync once on each distinct output directory. + const bool sync_output_directory_; + // True if this flush job should write to MANIFEST after successfully + // flushing memtables. False otherwise. + // Usually write_manifest_ is true. A flush job commits to the MANIFEST after + // flushing the memtables. + // However, an individual flush job cannot rashly write to the MANIFEST + // immediately after it finishes the flush if it is part of an atomic flush. + // In this case, only after all flush jobs succeed in flush can RocksDB + // commit to the MANIFEST. + const bool write_manifest_; + // The current flush job can commit flush result of a concurrent flush job. + // We collect FlushJobInfo of all jobs committed by current job and fire + // OnFlushCompleted for them. + std::list> committed_flush_jobs_info_; + + // Variables below are set by PickMemTable(): + FileMetaData meta_; + autovector mems_; + VersionEdit* edit_; + Version* base_; + bool pick_memtable_called; + Env::Priority thread_pri_; + + const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without + // db mutex + const SeqnoToTimeMapping& db_impl_seqno_time_mapping_; + SeqnoToTimeMapping seqno_to_time_mapping_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc new file mode 100644 index 000000000..f994b4e9b --- /dev/null +++ b/src/rocksdb/db/flush_job_test.cc @@ -0,0 +1,745 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/flush_job.h" + +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO(icanadi) Mock out everything else: +// 1. VersionSet +// 2. 
Memtable +class FlushJobTestBase : public testing::Test { + protected: + FlushJobTestBase(std::string dbname, const Comparator* ucmp) + : env_(Env::Default()), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), + options_(), + db_options_(options_), + column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()) {} + + virtual ~FlushJobTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still in %s\n", dbname_.c_str()); + } else { + // destroy versions_ to release all file handles + versions_.reset(); + EXPECT_OK(DestroyDir(env_, dbname_)); + } + } + + void NewDB() { + ASSERT_OK(SetIdentityFile(env_, dbname_)); + VersionEdit new_db; + + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + autovector new_cfs; + SequenceNumber last_seq = 1; + uint32_t cf_id = 1; + for (size_t i = 1; i != column_family_names_.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(column_family_names_[i]); + new_cf.SetColumnFamily(cf_id++); + new_cf.SetComparatorName(ucmp_->Name()); + new_cf.SetLogNumber(0); + new_cf.SetNextFile(2); + new_cf.SetLastSequence(last_seq++); + new_cfs.emplace_back(new_cf); + } + + const std::string manifest = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); + + { + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + ASSERT_OK(s); + + for (const auto& e : new_cfs) { + record.clear(); + e.EncodeTo(&record); + s = log.AddRecord(record); + ASSERT_OK(s); + } + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. 
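+    // [Editor's note] CURRENT is a small text file that holds the name of
+    // the active manifest (here MANIFEST-000001, i.e. descriptor number 1),
+    // which is why SetCurrentFile() below is passed the same descriptor
+    // number that DescriptorFileName() was given above.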
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + } + + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.statistics = CreateDBStatistics(); + + cf_options_.comparator = ucmp_; + + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families, false)); + } + + Env* env_; + std::shared_ptr fs_; + std::string dbname_; + const Comparator* const ucmp_; + EnvOptions env_options_; + Options options_; + ImmutableDBOptions db_options_; + const std::vector column_family_names_; + std::shared_ptr table_cache_; + WriteController write_controller_; + WriteBufferManager write_buffer_manager_; + ColumnFamilyOptions cf_options_; + std::unique_ptr versions_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; + + SeqnoToTimeMapping empty_seqno_to_time_mapping_; +}; + +class FlushJobTest : public FlushJobTestBase { + public: + FlushJobTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"), + BytewiseComparator()) {} +}; + +TEST_F(FlushJobTest, Empty) { + JobContext job_context(0); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relavant + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, + {}, kMaxSequenceNumber, snapshot_checker, &job_context, + nullptr, nullptr, nullptr, kNoCompression, nullptr, + &event_logger, false, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER, + nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + { + InstrumentedMutexLock l(&mutex_); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run()); + } + job_context.Clean(); +} + +TEST_F(FlushJobTest, NonEmpty) { + JobContext job_context(0); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + new_mem->Ref(); + auto inserted_keys = mock::MakeMockFile(); + // Test data: + // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ] + // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 
999 ] + // range-delete "9995" -> "9999" at seqno 10000 + // blob references with seqnos 10001..10006 + for (int i = 1; i < 10000; ++i) { + std::string key(std::to_string((i + 1000) % 10000)); + std::string value("value" + key); + ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value, + nullptr /* kv_prot_info */)); + if ((i + 1000) % 10000 < 9995) { + InternalKey internal_key(key, SequenceNumber(i), kTypeValue); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); + } + } + + { + ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", + "9999a", nullptr /* kv_prot_info */)); + InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); + inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"}); + } + + // Note: the first two blob references will not be considered when resolving + // the oldest blob file referenced (the first one is inlined TTL, while the + // second one is TTL and thus points to a TTL blob file). + constexpr std::array blob_file_numbers{ + {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}}; + for (size_t i = 0; i < blob_file_numbers.size(); ++i) { + std::string key(std::to_string(i + 10001)); + std::string blob_index; + if (i == 0) { + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL, + "foo"); + } else if (i == 1) { + BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL, + blob_file_numbers[i], /* offset */ i << 10, + /* size */ i << 20, kNoCompression); + } else { + BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i], + /* offset */ i << 10, /* size */ i << 20, + kNoCompression); + } + + const SequenceNumber seq(i + 10001); + ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index, + nullptr /* kv_prot_info */)); + + InternalKey internal_key(key, seq, kTypeBlobIndex); + inserted_keys.push_back({internal_key.Encode().ToString(), blob_index}); + } + mock::SortKVVector(&inserted_keys); + + autovector to_delete; + new_mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(new_mem, &to_delete); + for (auto& m : to_delete) { + delete m; + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relavant + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + + HistogramData hist; + FileMetaData file_meta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(nullptr, &file_meta)); + mutex_.Unlock(); + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + + ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ("9999a", file_meta.largest.user_key().ToString()); + ASSERT_EQ(1, file_meta.fd.smallest_seqno); + ASSERT_EQ(10006, file_meta.fd.largest_seqno); + ASSERT_EQ(17, file_meta.oldest_blob_file_number); + mock_table_factory_->AssertSingleFile(inserted_keys); + job_context.Clean(); +} + +TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { + const size_t num_mems = 2; + const size_t num_mems_to_flush = 1; + const size_t num_keys_per_table = 100; 
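+  // [Editor's note] Worked example of the memtable selection exercised below:
+  // the two memtables get IDs 0 and 1, and flushing num_mems_to_flush == 1 of
+  // them yields flush_memtable_id = 0 + 1 - 1 = 0, so only memtable 0 (keys
+  // "0".."99", seqnos 0..99) is picked. That is why the assertions at the end
+  // expect largest user key "99" and largest_seqno 1 * 100 - 1 = 99.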
+ JobContext job_context(0); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + std::vector memtable_ids; + std::vector new_mems; + for (size_t i = 0; i != num_mems; ++i) { + MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + mem->SetID(i); + mem->Ref(); + new_mems.emplace_back(mem); + memtable_ids.push_back(mem->GetID()); + + for (size_t j = 0; j < num_keys_per_table; ++j) { + std::string key(std::to_string(j + i * num_keys_per_table)); + std::string value("value" + key); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); + } + } + + autovector to_delete; + for (auto mem : new_mems) { + mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(mem, &to_delete); + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relavant + + assert(memtable_ids.size() == num_mems); + uint64_t smallest_memtable_id = memtable_ids.front(); + uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + HistogramData hist; + FileMetaData file_meta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta)); + mutex_.Unlock(); + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + + ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ("99", file_meta.largest.user_key().ToString()); + ASSERT_EQ(0, file_meta.fd.smallest_seqno); + ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1), + file_meta.fd.largest_seqno); + ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number); + + for (auto m : to_delete) { + delete m; + } + to_delete.clear(); + job_context.Clean(); +} + +TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { + autovector all_cfds; + for (auto cfd : *versions_->GetColumnFamilySet()) { + all_cfds.push_back(cfd); + } + const std::vector num_memtables = {2, 1, 3}; + assert(num_memtables.size() == column_family_names_.size()); + const size_t num_keys_per_memtable = 1000; + JobContext job_context(0); + std::vector memtable_ids; + std::vector smallest_seqs; + std::vector largest_seqs; + autovector to_delete; + SequenceNumber curr_seqno = 0; + size_t k = 0; + for (auto cfd : all_cfds) { + smallest_seqs.push_back(curr_seqno); + for (size_t i = 0; i != num_memtables[k]; ++i) { + MemTable* mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + mem->SetID(i); + mem->Ref(); + + for (size_t j = 0; j != num_keys_per_memtable; ++j) { + std::string key(std::to_string(j + i * num_keys_per_memtable)); + std::string value("value" + key); + ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value, + nullptr /* kv_prot_info */)); + } + mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(mem, &to_delete); + } + largest_seqs.push_back(curr_seqno - 1); + memtable_ids.push_back(num_memtables[k++] - 1); + } + + 
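+  // [Editor's note] This test drives atomic flush by hand: one FlushJob per
+  // column family below, then a single InstallMemtableAtomicFlushResults()
+  // call, so that either every column family's flush result is committed or
+  // none is. A hedged sketch of the user-facing path over the same machinery
+  // (the `handles` vector is hypothetical):
+  //
+  //   rocksdb::Options options;
+  //   options.atomic_flush = true;  // flush the listed CFs atomically
+  //   // ... open the DB with several column families, obtaining `handles` ...
+  //   rocksdb::Status flush_s = db->Flush(rocksdb::FlushOptions(), handles);
+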
EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant + std::vector> flush_jobs; + k = 0; + for (auto cfd : all_cfds) { + std::vector snapshot_seqs; + flush_jobs.emplace_back(new FlushJob( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + memtable_ids[k], env_options_, versions_.get(), &mutex_, + &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + false /* sync_output_directory */, false /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, + empty_seqno_to_time_mapping_)); + k++; + } + HistogramData hist; + std::vector file_metas; + // Call reserve to avoid auto-resizing + file_metas.reserve(flush_jobs.size()); + mutex_.Lock(); + for (auto& job : flush_jobs) { + job->PickMemTable(); + } + for (auto& job : flush_jobs) { + FileMetaData meta; + // Run will release and re-acquire mutex + ASSERT_OK(job->Run(nullptr /**/, &meta)); + file_metas.emplace_back(meta); + } + autovector file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } + autovector*> mems_list; + for (size_t i = 0; i != all_cfds.size(); ++i) { + const auto& mems = flush_jobs[i]->GetMemTables(); + mems_list.push_back(&mems); + } + autovector mutable_cf_options_list; + for (auto cfd : all_cfds) { + mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); + } + autovector>*> + committed_flush_jobs_info; +#ifndef ROCKSDB_LITE + for (auto& job : flush_jobs) { + committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo()); + } +#endif //! ROCKSDB_LITE + + Status s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, + versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs, + committed_flush_jobs_info, &job_context.memtables_to_free, + nullptr /* db_directory */, nullptr /* log_buffer */); + ASSERT_OK(s); + + mutex_.Unlock(); + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + k = 0; + for (const auto& file_meta : file_metas) { + ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ("999", file_meta.largest.user_key() + .ToString()); // max key by bytewise comparator + ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno); + ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno); + // Verify that imm is empty + ASSERT_EQ(std::numeric_limits::max(), + all_cfds[k]->imm()->GetEarliestMemTableID()); + ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID()); + ++k; + } + + for (auto m : to_delete) { + delete m; + } + to_delete.clear(); + job_context.Clean(); +} + +TEST_F(FlushJobTest, Snapshots) { + JobContext job_context(0); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + + std::set snapshots_set; + int keys = 10000; + int max_inserts_per_keys = 8; + + Random rnd(301); + for (int i = 0; i < keys / 2; ++i) { + snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1); + } + // set has already removed the duplicate snapshots + std::vector snapshots(snapshots_set.begin(), + snapshots_set.end()); + + new_mem->Ref(); + SequenceNumber current_seqno = 0; + auto inserted_keys = mock::MakeMockFile(); + for (int i = 1; i < keys; ++i) { + std::string key(std::to_string(i)); + int insertions = 
rnd.Uniform(max_inserts_per_keys); + for (int j = 0; j < insertions; ++j) { + std::string value(rnd.HumanReadableString(10)); + auto seqno = ++current_seqno; + ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value, + nullptr /* kv_prot_info */)); + // a key is visible only if: + // 1. it's the last one written (j == insertions - 1) + // 2. there's a snapshot pointing at it + bool visible = (j == insertions - 1) || + (snapshots_set.find(seqno) != snapshots_set.end()); + if (visible) { + InternalKey internal_key(key, seqno, kTypeValue); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); + } + } + } + mock::SortKVVector(&inserted_keys); + + autovector to_delete; + new_mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(new_mem, &to_delete); + for (auto& m : to_delete) { + delete m; + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relavant + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run()); + mutex_.Unlock(); + mock_table_factory_->AssertSingleFile(inserted_keys); + HistogramData hist; + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + job_context.Clean(); +} + +TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { + // Prepare a FlushJob that flush MemTables of Single Column Family. 
+ const size_t num_mems = 2; + const size_t num_mems_to_flush = 1; + const size_t num_keys_per_table = 100; + JobContext job_context(0); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + std::vector memtable_ids; + std::vector new_mems; + for (size_t i = 0; i != num_mems; ++i) { + MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + mem->SetID(i); + mem->Ref(); + new_mems.emplace_back(mem); + memtable_ids.push_back(mem->GetID()); + + for (size_t j = 0; j < num_keys_per_table; ++j) { + std::string key(std::to_string(j + i * num_keys_per_table)); + std::string value("value" + key); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); + } + } + + autovector to_delete; + for (auto mem : new_mems) { + mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(mem, &to_delete); + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relavant + + assert(memtable_ids.size() == num_mems); + uint64_t smallest_memtable_id = memtable_ids.front(); + uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + + // When the state from WriteController is normal. + ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH); + + WriteController* write_controller = + flush_job.versions_->GetColumnFamilySet()->write_controller(); + + { + // When the state from WriteController is Delayed. + std::unique_ptr delay_token = + write_controller->GetDelayToken(1000000); + ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER); + } + + { + // When the state from WriteController is Stopped. 
+ std::unique_ptr stop_token = + write_controller->GetStopToken(); + ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER); + } +} + +class FlushJobTimestampTest : public FlushJobTestBase { + public: + FlushJobTimestampTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), + test::BytewiseComparatorWithU64TsWrapper()) {} + + void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts, + SequenceNumber seq, ValueType value_type, + Slice value) { + std::string key_str(std::move(key)); + PutFixed64(&key_str, ts); + ASSERT_OK(memtable->Add(seq, value_type, key_str, value, + nullptr /* kv_prot_info */)); + } + + protected: + static constexpr uint64_t kStartTs = 10; + static constexpr SequenceNumber kStartSeq = 0; + SequenceNumber curr_seq_{kStartSeq}; + std::atomic curr_ts_{kStartTs}; +}; + +TEST_F(FlushJobTimestampTest, AllKeysExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeDeletionWithTimestamp, ""); + new_mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + constexpr SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, + /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string key = test::EncodeInt(0); + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); + ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + } + + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(FlushJobTimestampTest, NoKeyExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + new_mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(new_mem, &to_delete); + } + + 
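+
+  // [Editor's note] Unlike AllKeysExpired above, full_history_ts_low is
+  // encoded as 0 below, so no timestamp falls below the cutoff and the flush
+  // must retain all 100 versions written above. The assertions that follow
+  // therefore expect the smallest internal key at the newest timestamp
+  // (kStartTs + 99) and the largest at the oldest (kStartTs), since this
+  // comparator sorts larger timestamps first within the same user key.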
std::vector snapshots; + SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 0); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, + /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string ukey = test::EncodeInt(0); + std::string smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + std::string largest_key = ukey + test::EncodeInt(kStartTs); + InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + } + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc new file mode 100644 index 000000000..6f4d3e1a5 --- /dev/null +++ b/src/rocksdb/db/flush_scheduler.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/flush_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace ROCKSDB_NAMESPACE { + +void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) { +#ifndef NDEBUG + { + std::lock_guard lock(checking_mutex_); + assert(checking_set_.count(cfd) == 0); + checking_set_.insert(cfd); + } +#endif // NDEBUG + cfd->Ref(); +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ + Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)}; + while (!head_.compare_exchange_strong( + node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) { + // failing CAS updates the first param, so we are already set for + // retry. 
TakeNextColumnFamily won't happen until after another
+    // inter-thread synchronization, so we don't even need release
+    // semantics for this CAS
+  }
+#endif  // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+  while (true) {
+    if (head_.load(std::memory_order_relaxed) == nullptr) {
+      return nullptr;
+    }
+
+    // dequeue the head
+    Node* node = head_.load(std::memory_order_relaxed);
+    head_.store(node->next, std::memory_order_relaxed);
+    ColumnFamilyData* cfd = node->column_family;
+    delete node;
+
+#ifndef NDEBUG
+    {
+      std::lock_guard<std::mutex> lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
+#endif  // NDEBUG
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+
+    // no longer relevant, retry
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool FlushScheduler::Empty() {
+  auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  // Empty is allowed to be called concurrently with ScheduleWork. It would
+  // only miss the recent schedules.
+  assert((rv == checking_set_.empty()) || rv);
+#endif  // NDEBUG
+  return rv;
+}
+
+void FlushScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..eb03f3e11
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include <set>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
+class FlushScheduler {
+ public:
+  FlushScheduler() : head_(nullptr) {}
+
+  // May be called from multiple threads at once, but not concurrently with
+  // any other method calls on this instance
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Removes and returns Ref()-ed column family. Client needs to Unref().
+  // Filters column families that have been dropped.
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  // This can be called concurrently with ScheduleWork, but it would miss all
+  // the scheduled flushes after the last synchronization. This would result
+  // in less precise enforcement of memtable sizes, but should not matter much.
+  bool Empty();
+
+  void Clear();
+
+ private:
+  struct Node {
+    ColumnFamilyData* column_family;
+    Node* next;
+  };
+
+  std::atomic<Node*> head_;
+#ifndef NDEBUG
+  std::mutex checking_mutex_;
+  std::set<ColumnFamilyData*> checking_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..3fbc2cf47
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,1062 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "db/forward_iterator.h" + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// Usage: +// ForwardLevelIterator iter; +// iter.SetFileIndex(file_index); +// iter.Seek(target); // or iter.SeekToFirst(); +// iter.Next() +class ForwardLevelIterator : public InternalIterator { + public: + ForwardLevelIterator( + const ColumnFamilyData* const cfd, const ReadOptions& read_options, + const std::vector& files, + const std::shared_ptr& prefix_extractor, + bool allow_unprepared_value) + : cfd_(cfd), + read_options_(read_options), + files_(files), + valid_(false), + file_index_(std::numeric_limits::max()), + file_iter_(nullptr), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + allow_unprepared_value_(allow_unprepared_value) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } + + ~ForwardLevelIterator() override { + // Reset current pointer + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(file_iter_); + } else { + delete file_iter_; + } + } + + void SetFileIndex(uint32_t file_index) { + assert(file_index < files_.size()); + status_ = Status::OK(); + if (file_index != file_index_) { + file_index_ = file_index; + Reset(); + } + } + void Reset() { + assert(file_index_ < files_.size()); + + // Reset current pointer + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(file_iter_); + } else { + delete file_iter_; + } + + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + file_iter_ = cfd_->table_cache()->NewIterator( + read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), + *files_[file_index_], + read_options_.ignore_range_deletions ? 
+        prefix_extractor_, /*table_reader_ptr=*/nullptr,
+        /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+        /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+        /*max_file_size_for_l0_meta_pin=*/0,
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+    file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    valid_ = false;
+    if (!range_del_agg.IsEmpty()) {
+      status_ = Status::NotSupported(
+          "Range tombstones unsupported with ForwardIterator");
+    }
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+    valid_ = false;
+  }
+  bool Valid() const override { return valid_; }
+  void SeekToFirst() override {
+    assert(file_iter_ != nullptr);
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+    file_iter_->SeekToFirst();
+    valid_ = file_iter_->Valid();
+  }
+  void Seek(const Slice& internal_key) override {
+    assert(file_iter_ != nullptr);
+
+    // This deviates from the usual convention for InternalIterator::Seek() in
+    // that it doesn't discard pre-existing error status. That's because this
+    // Seek() is only supposed to be called immediately after SetFileIndex()
+    // (which discards pre-existing error status), and SetFileIndex() may set
+    // an error status, which we shouldn't discard.
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+
+    file_iter_->Seek(internal_key);
+    valid_ = file_iter_->Valid();
+  }
+  void SeekForPrev(const Slice& /*internal_key*/) override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void Next() override {
+    assert(valid_);
+    file_iter_->Next();
+    for (;;) {
+      valid_ = file_iter_->Valid();
+      if (!file_iter_->status().ok()) {
+        assert(!valid_);
+        return;
+      }
+      if (valid_) {
+        return;
+      }
+      if (file_index_ + 1 >= files_.size()) {
+        valid_ = false;
+        return;
+      }
+      SetFileIndex(file_index_ + 1);
+      if (!status_.ok()) {
+        assert(!valid_);
+        return;
+      }
+      file_iter_->SeekToFirst();
+    }
+  }
+  Slice key() const override {
+    assert(valid_);
+    return file_iter_->key();
+  }
+  Slice value() const override {
+    assert(valid_);
+    return file_iter_->value();
+  }
+  Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    } else if (file_iter_) {
+      return file_iter_->status();
+    }
+    return Status::OK();
+  }
+  bool PrepareValue() override {
+    assert(valid_);
+    if (file_iter_->PrepareValue()) {
+      return true;
+    }
+
+    assert(!file_iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsKeyPinned();
+  }
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsValuePinned();
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_) {
+      file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+ private:
+  const ColumnFamilyData* const cfd_;
+  const ReadOptions& read_options_;
+  const std::vector<FileMetaData*>& files_;
+
+  bool valid_;
+  uint32_t file_index_;
+  Status status_;
+  InternalIterator* file_iter_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  // Kept alive by ForwardIterator::sv_->mutable_cf_options
+  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+  const bool allow_unprepared_value_;
+};
+
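ForwardLevelIterator above stitches the sorted files of one level into a single forward-only stream: when the current file's iterator is exhausted, Next() advances to the following file via SetFileIndex() and continues from its first key. A minimal self-contained sketch of that rollover pattern, with plain std containers standing in for RocksDB's table iterators (ConcatForwardIterator and everything in it are illustrative names, not RocksDB API):

```cpp
#include <cstddef>
#include <vector>

// Each inner vector plays the role of one sorted SST file in a level.
// Next() falls through to the next non-empty "file" when one is exhausted,
// mirroring ForwardLevelIterator's SetFileIndex()/Next() rollover.
class ConcatForwardIterator {
 public:
  explicit ConcatForwardIterator(const std::vector<std::vector<int>>& files)
      : files_(files) {}

  void SeekToFirst() {
    file_ = 0;
    pos_ = 0;
    SkipEmptyFiles();
  }

  bool Valid() const { return file_ < files_.size(); }

  int key() const { return files_[file_][pos_]; }

  void Next() {
    ++pos_;
    if (pos_ >= files_[file_].size()) {
      // Current "file" exhausted: move to the next one and restart at its
      // first entry, like SetFileIndex(file_index_ + 1) plus SeekToFirst().
      ++file_;
      pos_ = 0;
      SkipEmptyFiles();
    }
  }

 private:
  void SkipEmptyFiles() {
    while (file_ < files_.size() && files_[file_].empty()) {
      ++file_;
    }
  }

  const std::vector<std::vector<int>>& files_;
  std::size_t file_ = 0;
  std::size_t pos_ = 0;
};
```

The real iterator additionally propagates per-file status and rejects range tombstones, but the file-to-file advance inside Next() follows the same shape.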
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd,
+                                 SuperVersion* current_sv,
+                                 bool allow_unprepared_value)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+      user_comparator_(cfd->user_comparator()),
+      allow_unprepared_value_(allow_unprepared_value),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(current_sv),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      valid_(false),
+      status_(Status::OK()),
+      immutable_status_(Status::OK()),
+      has_iter_trimmed_for_upper_bound_(false),
+      current_over_upper_bound_(false),
+      is_prev_set_(false),
+      is_prev_inclusive_(false),
+      pinned_iters_mgr_(nullptr) {
+  if (sv_) {
+    RebuildIterators(false);
+  }
+
+  // immutable_status_ is a local aggregation of the
+  // status of the immutable Iterators.
+  // We have to PermitUncheckedError in case it is never
+  // used, otherwise it will fail ASSERT_STATUS_CHECKED.
+  immutable_status_.PermitUncheckedError();
+}
+
+ForwardIterator::~ForwardIterator() { Cleanup(true); }
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+                                bool background_purge_on_iterator_cleanup) {
+  if (sv->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread
+    JobContext job_context(0);
+    db->mutex_.Lock();
+    sv->Cleanup();
+    db->FindObsoleteFiles(&job_context, false, true);
+    if (background_purge_on_iterator_cleanup) {
+      db->ScheduleBgLogWriterClose(&job_context);
+      db->AddSuperVersionsToFreeQueue(sv);
+      db->SchedulePurge();
+    }
+    db->mutex_.Unlock();
+    if (!background_purge_on_iterator_cleanup) {
+      delete sv;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      db->PurgeObsoleteFiles(job_context,
+                             background_purge_on_iterator_cleanup);
+    }
+    job_context.Clean();
+  }
+}
+
+namespace {
+struct SVCleanupParams {
+  DBImpl* db;
+  SuperVersion* sv;
+  bool background_purge_on_iterator_cleanup;
+};
+}  // anonymous namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+  auto d = reinterpret_cast<SVCleanupParams*>(arg);
+  ForwardIterator::SVCleanup(d->db, d->sv,
+                             d->background_purge_on_iterator_cleanup);
+  delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+  if (sv_ == nullptr) {
+    return;
+  }
+  bool background_purge =
+      read_options_.background_purge_on_iterator_cleanup ||
+      db_->immutable_db_options().avoid_unnecessary_blocking_io;
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    // pinned_iters_mgr_ tells us to make sure that all visited key-value
+    // slices are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+    // The slices may point into some memtables owned by sv_, so we need to
+    // keep sv_ referenced until pinned_iters_mgr_ unpins everything.
+    auto p = new SVCleanupParams{db_, sv_, background_purge};
+    pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+  } else {
+    SVCleanup(db_, sv_, background_purge);
+  }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+  if (mutable_iter_ != nullptr) {
+    DeleteIterator(mutable_iter_, true /* is_arena */);
+  }
+
+  for (auto* m : imm_iters_) {
+    DeleteIterator(m, true /* is_arena */);
+  }
+  imm_iters_.clear();
+
+  for (auto* f : l0_iters_) {
+    DeleteIterator(f);
+  }
+  l0_iters_.clear();
+
+  for (auto* l : level_iters_) {
+    DeleteIterator(l);
+  }
+  level_iters_.clear();
+
+  if (release_sv) {
+    SVCleanup();
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  // See UpdateCurrent().
+  return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(Slice(), true, false);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+  return !(read_options_.iterate_upper_bound == nullptr ||
+           cfd_->internal_comparator().user_comparator()->Compare(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+
+  SeekInternal(internal_key, false, false);
+  if (read_options_.async_io) {
+    SeekInternal(internal_key, false, true);
+  }
+}
+
+// In case of async_io, SeekInternal is called twice: seek_after_async_io is
+// enabled in the second call, which only performs the seek part in order to
+// retrieve the blocks.
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first,
+                                   bool seek_after_async_io) {
+  assert(mutable_iter_);
+  // mutable
+  if (!seek_after_async_io) {
+    seek_to_first ? mutable_iter_->SeekToFirst()
+                  : mutable_iter_->Seek(internal_key);
+  }
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has a negative impact on performance
+  // if it turns out that we need to seek the immutable iterators often. We
+  // probably want to have an option to turn it off.
+  if (seek_to_first || seek_after_async_io ||
+      NeedToSeekImmutable(internal_key)) {
+    if (!seek_after_async_io) {
+      immutable_status_ = Status::OK();
+      if (has_iter_trimmed_for_upper_bound_ &&
+          (
+              // prev_ is not set yet
+              is_prev_set_ == false ||
+              // We are doing SeekToFirst() and internal_key.size() = 0
+              seek_to_first ||
+              // prev_key_ > internal_key
+              cfd_->internal_comparator().InternalKeyComparator::Compare(
+                  prev_key_.GetInternalKey(), internal_key) > 0)) {
+        // Some iterators are trimmed. Need to rebuild.
+        RebuildIterators(true);
+        // Already seeked mutable iter, so seek again
+        seek_to_first ? mutable_iter_->SeekToFirst()
+                      : mutable_iter_->Seek(internal_key);
+      }
+      {
+        auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+        immutable_min_heap_.swap(tmp);
+      }
+      for (size_t i = 0; i < imm_iters_.size(); i++) {
+        auto* m = imm_iters_[i];
+        seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+        if (!m->status().ok()) {
+          immutable_status_ = m->status();
+        } else if (m->Valid()) {
+          immutable_min_heap_.push(m);
+        }
+      }
+    }
+
+    Slice target_user_key;
+    if (!seek_to_first) {
+      target_user_key = ExtractUserKey(internal_key);
+    }
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (size_t i = 0; i < l0.size(); ++i) {
+      if (!l0_iters_[i]) {
+        continue;
+      }
+      if (seek_after_async_io) {
+        if (!l0_iters_[i]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (seek_after_async_io == false &&
+            user_comparator_->Compare(target_user_key,
+                                      l0[i]->largest.user_key()) > 0) {
+          if (read_options_.iterate_upper_bound != nullptr) {
+            has_iter_trimmed_for_upper_bound_ = true;
+            DeleteIterator(l0_iters_[i]);
+            l0_iters_[i] = nullptr;
+          }
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+
+      if (l0_iters_[i]->status().IsTryAgain()) {
+        assert(!seek_after_async_io);
+        continue;
+      } else if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
+      } else if (l0_iters_[i]->Valid() &&
+                 !IsOverUpperBound(l0_iters_[i]->key())) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      } else {
+        has_iter_trimmed_for_upper_bound_ = true;
+        DeleteIterator(l0_iters_[i]);
+        l0_iters_[i] = nullptr;
+      }
+    }
+
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
+        continue;
+      }
+      if (level_iters_[level - 1] == nullptr) {
+        continue;
+      }
+
+      if (seek_after_async_io) {
+        if (!level_iters_[level - 1]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+      uint32_t f_idx = 0;
+      if (!seek_to_first && !seek_after_async_io) {
+        f_idx = FindFileInRange(level_files, internal_key, 0,
+                                static_cast<uint32_t>(level_files.size()));
+      }
+
+      // Seek
+      if (seek_after_async_io || f_idx < level_files.size()) {
+        if (!seek_after_async_io) {
+          level_iters_[level - 1]->SetFileIndex(f_idx);
+        }
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst()
+                      : level_iters_[level - 1]->Seek(internal_key);
+
+        if (level_iters_[level - 1]->status().IsTryAgain()) {
+          assert(!seek_after_async_io);
+          continue;
+        } else if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
+        } else if (level_iters_[level - 1]->Valid() &&
+                   !IsOverUpperBound(level_iters_[level - 1]->key())) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        } else {
+          // Nothing in this level is interesting. Remove.
+          has_iter_trimmed_for_upper_bound_ = true;
+          DeleteIterator(level_iters_[level - 1]);
+          level_iters_[level - 1] = nullptr;
+        }
+      }
+    }
+
+    if (seek_to_first) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetInternalKey(internal_key);
+      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+  } else if (current_ && current_ != mutable_iter_) {
+    // current_ is one of immutable iterators, push it back to the heap
+    immutable_min_heap_.push(current_);
+  }
+
+  // For async_io, it should be updated when seek_after_async_io is true (in
+  // the second call).
+ if (seek_to_first || !read_options_.async_io || seek_after_async_io) { + UpdateCurrent(); + } + TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this); +} + +void ForwardIterator::Next() { + assert(valid_); + bool update_prev_key = false; + + if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) { + std::string current_key = key().ToString(); + Slice old_key(current_key.data(), current_key.size()); + + if (sv_ == nullptr) { + RebuildIterators(true); + } else { + RenewIterators(); + } + + SeekInternal(old_key, false, false); + if (read_options_.async_io) { + SeekInternal(old_key, false, true); + } + + if (!valid_ || key().compare(old_key) != 0) { + return; + } + } else if (current_ != mutable_iter_) { + // It is going to advance immutable iterator + + if (is_prev_set_ && prefix_extractor_) { + // advance prev_key_ to current_ only if they share the same prefix + update_prev_key = + prefix_extractor_->Transform(prev_key_.GetUserKey()) + .compare(prefix_extractor_->Transform(current_->key())) == 0; + } else { + update_prev_key = true; + } + + if (update_prev_key) { + prev_key_.SetInternalKey(current_->key()); + is_prev_set_ = true; + is_prev_inclusive_ = false; + } + } + + current_->Next(); + if (current_ != mutable_iter_) { + if (!current_->status().ok()) { + immutable_status_ = current_->status(); + } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) { + immutable_min_heap_.push(current_); + } else { + if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) { + // remove the current iterator + DeleteCurrentIter(); + current_ = nullptr; + } + if (update_prev_key) { + mutable_iter_->Seek(prev_key_.GetInternalKey()); + } + } + } + UpdateCurrent(); + TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this); +} + +Slice ForwardIterator::key() const { + assert(valid_); + return current_->key(); +} + +Slice ForwardIterator::value() const { + assert(valid_); + return current_->value(); +} + +Status ForwardIterator::status() const { + if (!status_.ok()) { + return status_; + } else if (!mutable_iter_->status().ok()) { + return mutable_iter_->status(); + } + + return immutable_status_; +} + +bool ForwardIterator::PrepareValue() { + assert(valid_); + if (current_->PrepareValue()) { + return true; + } + + assert(!current_->Valid()); + assert(!current_->status().ok()); + assert(current_ != mutable_iter_); // memtable iterator can't fail + assert(immutable_status_.ok()); + + valid_ = false; + immutable_status_ = current_->status(); + return false; +} + +Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { + assert(prop != nullptr); + if (prop_name == "rocksdb.iterator.super-version-number") { + *prop = std::to_string(sv_->version_number); + return Status::OK(); + } + return Status::InvalidArgument(); +} + +void ForwardIterator::SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) { + pinned_iters_mgr_ = pinned_iters_mgr; + UpdateChildrenPinnedItersMgr(); +} + +void ForwardIterator::UpdateChildrenPinnedItersMgr() { + // Set PinnedIteratorsManager for mutable memtable iterator. + if (mutable_iter_) { + mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_); + } + + // Set PinnedIteratorsManager for immutable memtable iterators. + for (InternalIterator* child_iter : imm_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } + + // Set PinnedIteratorsManager for L0 files iterators. 
+  for (InternalIterator* child_iter : l0_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+  // Set PinnedIteratorsManager for L1+ levels iterators.
+  for (ForwardLevelIterator* child_iter : level_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+  return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+         current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+  return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+         current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+  // Clean up
+  Cleanup(refresh_sv);
+  if (refresh_sv) {
+    // New
+    sv_ = cfd_->GetReferencedSuperVersion(db_);
+  }
+  ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                       kMaxSequenceNumber /* upper_bound */);
+  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  if (!read_options_.ignore_range_deletions) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        sv_->mem->NewRangeTombstoneIterator(
+            read_options_, sv_->current->version_set()->LastSequence(),
+            false /* immutable_memtable */));
+    range_del_agg.AddTombstones(std::move(range_del_iter));
+    // AddRangeTombstoneIterators always returns Status::OK().
+    Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+                                                         &range_del_agg);
+    assert(temp_s.ok());
+  }
+  has_iter_trimmed_for_upper_bound_ = false;
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
+  l0_iters_.reserve(l0_files.size());
+  for (const auto* l0 : l0_files) {
+    if ((read_options_.iterate_upper_bound != nullptr) &&
+        cfd_->internal_comparator().user_comparator()->Compare(
+            l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+      // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+      // will never be interested in files with smallest key above
+      // iterate_upper_bound, since iterate_upper_bound can't be changed.
+      l0_iters_.push_back(nullptr);
+      continue;
+    }
+    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+        sv_->mutable_cf_options.prefix_extractor,
+        /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+        TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+        /*skip_filters=*/false, /*level=*/-1,
+        MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+  }
+  BuildLevelIterators(vstorage, sv_);
+  current_ = nullptr;
+  is_prev_set_ = false;
+
+  UpdateChildrenPinnedItersMgr();
+  if (!range_del_agg.IsEmpty()) {
+    status_ = Status::NotSupported(
+        "Range tombstones unsupported with ForwardIterator");
+    valid_ = false;
+  }
+}
+
+void ForwardIterator::RenewIterators() {
+  SuperVersion* svnew;
+  assert(sv_);
+  svnew = cfd_->GetReferencedSuperVersion(db_);
+
+  if (mutable_iter_ != nullptr) {
+    DeleteIterator(mutable_iter_, true /* is_arena */);
+  }
+  for (auto* m : imm_iters_) {
+    DeleteIterator(m, true /* is_arena */);
+  }
+  imm_iters_.clear();
+
+  mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+  svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                       kMaxSequenceNumber /* upper_bound */);
+  if (!read_options_.ignore_range_deletions) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        svnew->mem->NewRangeTombstoneIterator(
+            read_options_, sv_->current->version_set()->LastSequence(),
+            false /* immutable_memtable */));
+    range_del_agg.AddTombstones(std::move(range_del_iter));
+    // AddRangeTombstoneIterators always returns Status::OK().
+    Status temp_s = svnew->imm->AddRangeTombstoneIterators(
+        read_options_, &arena_, &range_del_agg);
+    assert(temp_s.ok());
+  }
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
+  const auto* vstorage_new = svnew->current->storage_info();
+  const auto& l0_files_new = vstorage_new->LevelFiles(0);
+  size_t iold, inew;
+  bool found;
+  std::vector<InternalIterator*> l0_iters_new;
+  l0_iters_new.reserve(l0_files_new.size());
+
+  for (inew = 0; inew < l0_files_new.size(); inew++) {
+    found = false;
+    for (iold = 0; iold < l0_files.size(); iold++) {
+      if (l0_files[iold] == l0_files_new[inew]) {
+        found = true;
+        break;
+      }
+    }
+    if (found) {
+      if (l0_iters_[iold] == nullptr) {
+        l0_iters_new.push_back(nullptr);
+        TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+      } else {
+        l0_iters_new.push_back(l0_iters_[iold]);
+        l0_iters_[iold] = nullptr;
+        TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+      }
+      continue;
+    }
+    l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+        *l0_files_new[inew],
+        read_options_.ignore_range_deletions ?
nullptr : &range_del_agg, + svnew->mutable_cf_options.prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + } + + for (auto* f : l0_iters_) { + DeleteIterator(f); + } + l0_iters_.clear(); + l0_iters_ = l0_iters_new; + + for (auto* l : level_iters_) { + DeleteIterator(l); + } + level_iters_.clear(); + BuildLevelIterators(vstorage_new, svnew); + current_ = nullptr; + is_prev_set_ = false; + SVCleanup(); + sv_ = svnew; + + UpdateChildrenPinnedItersMgr(); + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + valid_ = false; + } +} + +void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv) { + level_iters_.reserve(vstorage->num_levels() - 1); + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + const auto& level_files = vstorage->LevelFiles(level); + if ((level_files.empty()) || + ((read_options_.iterate_upper_bound != nullptr) && + (user_comparator_->Compare(*read_options_.iterate_upper_bound, + level_files[0]->smallest.user_key()) < + 0))) { + level_iters_.push_back(nullptr); + if (!level_files.empty()) { + has_iter_trimmed_for_upper_bound_ = true; + } + } else { + level_iters_.push_back(new ForwardLevelIterator( + cfd_, read_options_, level_files, + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); + } + } +} + +void ForwardIterator::ResetIncompleteIterators() { + const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); + for (size_t i = 0; i < l0_iters_.size(); ++i) { + assert(i < l0_files.size()); + if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) { + continue; + } + DeleteIterator(l0_iters_[i]); + l0_iters_[i] = cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); + } + + for (auto* level_iter : level_iters_) { + if (level_iter && level_iter->status().IsIncomplete()) { + level_iter->Reset(); + } + } + + current_ = nullptr; + is_prev_set_ = false; +} + +void ForwardIterator::UpdateCurrent() { + if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { + current_ = nullptr; + } else if (immutable_min_heap_.empty()) { + current_ = mutable_iter_; + } else if (!mutable_iter_->Valid()) { + current_ = immutable_min_heap_.top(); + immutable_min_heap_.pop(); + } else { + current_ = immutable_min_heap_.top(); + assert(current_ != nullptr); + assert(current_->Valid()); + int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare( + mutable_iter_->key(), current_->key()); + assert(cmp != 0); + if (cmp > 0) { + immutable_min_heap_.pop(); + } else { + current_ = mutable_iter_; + } + } + valid_ = current_ != nullptr && immutable_status_.ok(); + if (!status_.ok()) { + status_ = Status::OK(); + } + + // Upper bound doesn't apply to the memtable iterator. 
We want Valid() to
+  // return false when all iterators are over iterate_upper_bound, but can't
+  // just set valid_ to false, as that would effectively disable the tailing
+  // optimization (Seek() would be called on all immutable iterators regardless
+  // of whether the target key is greater than prev_key_).
+  current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+  // such that there are no records with keys within that range in
+  // immutable_min_heap_. Since immutable structures (SST files and immutable
+  // memtables) can't change in this version, we don't need to do a seek if
+  // 'target' belongs to that interval (immutable_min_heap_.top() is already
+  // at the correct position).
+
+  if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetInternalKey();
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+                               prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+    return true;
+  }
+
+  if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+    // Nothing to seek on.
+    return false;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                            : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      continue;
+    }
+    if (l0_iters_[i] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(l0_iters_[i]);
+      l0_iters_[i] = nullptr;
+      return;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if (level_iters_[level - 1] == nullptr) {
+      continue;
+    }
+    if (level_iters_[level - 1] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(level_iters_[level - 1]);
+      level_iters_[level - 1] = nullptr;
+    }
+  }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+                                             int* pnum_iters) {
+  bool retval = false;
+  int deleted_iters = 0;
+  int num_iters = 0;
+
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      retval = true;
+      deleted_iters++;
+    } else {
+      num_iters++;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if ((level_iters_[level - 1] == nullptr) &&
+        (!vstorage->LevelFiles(level).empty())) {
+      retval = true;
+      deleted_iters++;
+    } else if (!vstorage->LevelFiles(level).empty()) {
+      num_iters++;
+    }
+  }
+  if ((!retval) && num_iters <= 1) {
+    retval = true;
+  }
+  if (pdeleted_iters) {
+    *pdeleted_iters = deleted_iters;
+  }
+  if (pnum_iters) {
+    *pnum_iters = num_iters;
+  }
+  return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool {
+    return cfd_->internal_comparator().InternalKeyComparator::Compare(
+               f->largest.Encode(), k) < 0;
+  };
+  const auto& b = files.begin();
+  return static_cast<uint32_t>(
+      std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+  if (iter == nullptr) {
+    return;
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(iter, is_arena);
+  } else {
+    if (is_arena) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..5a5c6f0f3
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/comparator.h"
+#ifndef ROCKSDB_LITE
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const CompareInterface* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(InternalIterator* a, InternalIterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const CompareInterface* comparator_;
+};
+
+using MinIterHeap =
+    std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+                        MinIterComparator>;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not see values written
+ * after that point.
+ */
+class ForwardIterator : public InternalIterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr,
+                  bool allow_unprepared_value = false);
+  virtual ~ForwardIterator();
+
+  void SeekForPrev(const Slice& /*target*/) override {
+    status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  virtual bool PrepareValue() override;
+  virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+  virtual void SetPinnedItersMgr(
+      PinnedIteratorsManager* pinned_iters_mgr) override;
+  virtual bool IsKeyPinned() const override;
+  virtual bool IsValuePinned() const override;
+
+  bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+  void Cleanup(bool release_sv);
+  // Unreference and, if needed, clean up the current SuperVersion. This is
+  // either done immediately or deferred until this iterator is unpinned by
+  // PinnedIteratorsManager.
+  void SVCleanup();
+  static void SVCleanup(DBImpl* db, SuperVersion* sv,
+                        bool background_purge_on_iterator_cleanup);
+  static void DeferredSVCleanup(void* arg);
+
+  void RebuildIterators(bool refresh_sv);
+  void RenewIterators();
+  void BuildLevelIterators(const VersionStorageInfo* vstorage,
+                           SuperVersion* sv);
+  void ResetIncompleteIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first,
+                    bool seek_after_async_io);
+
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  void DeleteCurrentIter();
+  uint32_t FindFileInRange(const std::vector<FileMetaData*>& files,
+                           const Slice& internal_key, uint32_t left,
+                           uint32_t right);
+
+  bool IsOverUpperBound(const Slice& internal_key) const;
+
+  // Set PinnedIteratorsManager for all children Iterators. This function
+  // should be called whenever we update children Iterators or
+  // pinned_iters_mgr_.
+  void UpdateChildrenPinnedItersMgr();
+
+  // A helper function that will release iter in the proper manner, or pass it
+  // to pinned_iters_mgr_ to release it later if pinning is enabled.
+  void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  const bool allow_unprepared_value_;
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;
+  InternalIterator* mutable_iter_;
+  std::vector<InternalIterator*> imm_iters_;
+  std::vector<InternalIterator*> l0_iters_;
+  std::vector<ForwardLevelIterator*> level_iters_;
+  InternalIterator* current_;
+  bool valid_;
+
+  // Internal iterator status; set only by one of the unsupported methods.
+  Status status_;
+  // Status of immutable iterators, maintained here to avoid iterating over
+  // all of them in status().
+  Status immutable_status_;
+  // Indicates that at least one of the immutable iterators pointed to a key
+  // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+  // need to rebuild such iterators.
+  bool has_iter_trimmed_for_upper_bound_;
+  // Is current key larger than iterate_upper_bound? If so, makes Valid()
+  // return false.
+  bool current_over_upper_bound_;
+
+  // Left endpoint of the range of keys that immutable iterators currently
+  // cover. When Seek() is called with a key that's within that range,
+  // immutable iterators don't need to be moved; see NeedToSeekImmutable().
+  // This key is included in the range after a Seek(), but excluded when
+  // advancing the iterator using Next().
+  IterKey prev_key_;
+  bool is_prev_set_;
+  bool is_prev_inclusive_;
+
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  Arena arena_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..325661cef
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
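The benchmark below depends on a detail that is easy to miss: Key stores shard and seqno big-endian (htobe64), so that the DB's default bytewise comparator orders keys numerically, first by shard and then by seqno. A small sketch of why big-endian encoding has that property, using a portable byte loop instead of the Linux-specific htobe64 (PutBigEndian64 and MakeKey are illustrative helpers, not part of the benchmark):

```cpp
#include <cstdint>
#include <string>

// Append v to *out in big-endian byte order. Lexicographic (bytewise)
// comparison of the encoded bytes then matches numeric comparison of v.
static void PutBigEndian64(uint64_t v, std::string* out) {
  for (int shift = 56; shift >= 0; shift -= 8) {
    out->push_back(static_cast<char>((v >> shift) & 0xff));
  }
}

// (shard, seqno) encoded this way sorts first by shard, then by seqno,
// which is the iteration order the tailing readers below rely on.
static std::string MakeKey(uint64_t shard, uint64_t seqno) {
  std::string key;
  PutBigEndian64(shard, &key);
  PutBigEndian64(seqno, &key);
  return key;
}
```

With little-endian encoding, 256 would sort before 1 under bytewise comparison; putting the most significant byte first makes byte order and numeric order agree.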
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under MAC and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <condition_variable>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+  char pad1[128] __attribute__((__unused__));
+  std::atomic<uint64_t> written{0};
+  char pad2[128] __attribute__((__unused__));
+  std::atomic<uint64_t> read{0};
+  std::atomic<uint64_t> cache_misses{0};
+  char pad3[128] __attribute__((__unused__));
+} stats;
+
+struct Key {
+  Key() {}
+  Key(uint64_t shard_in, uint64_t seqno_in)
+      : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+  uint64_t shard() const { return be64toh(shard_be); }
+  uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+  uint64_t shard_be;
+  uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+  char pad1[128] __attribute__((__unused__));
+  std::atomic<uint64_t> last_written{0};
+  Writer* writer;
+  Reader* reader;
+  char pad2[128] __attribute__((__unused__));
+  std::atomic<uint64_t> last_read{0};
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+  Key upper_bound;
+  ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+  char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+  explicit Reader(std::vector<ShardState>* shard_states,
+                  ROCKSDB_NAMESPACE::DB* db)
+      : shard_states_(shard_states), db_(db) {
+    sem_init(&sem_, 0, 0);
+    thread_ = port::Thread(&Reader::run, this);
+  }
+
+  void run() {
+    while (1) {
+      sem_wait(&sem_);
+      if (done_.load()) {
+        break;
+      }
+
+      uint64_t shard;
+      {
+        std::lock_guard<std::mutex> guard(queue_mutex_);
+        assert(!shards_pending_queue_.empty());
+        shard = shards_pending_queue_.front();
+        shards_pending_queue_.pop();
+        shards_pending_set_.reset(shard);
+      }
+      readOnceFromShard(shard);
+    }
+  }
+
+  void readOnceFromShard(uint64_t shard) {
+    ShardState& state = (*shard_states_)[shard];
+    if (!state.it) {
+      // Initialize iterators
+      ROCKSDB_NAMESPACE::ReadOptions options;
+      options.tailing = true;
+      if (FLAGS_iterate_upper_bound) {
+        state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+        state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+            (const char*)&state.upper_bound, sizeof(state.upper_bound));
+        options.iterate_upper_bound = &state.upper_bound_slice;
+      }
+
+      state.it.reset(db_->NewIterator(options));
+
+      if (FLAGS_cache_only_first) {
+        options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+        state.it_cacheonly.reset(db_->NewIterator(options));
+      }
+    }
+
+    const uint64_t upto = state.last_written.load();
+    for (ROCKSDB_NAMESPACE::Iterator* it :
+         {state.it_cacheonly.get(), state.it.get()}) {
+      if (it == nullptr) {
+        continue;
+      }
+      if (state.last_read.load() >= upto) {
+        break;
+      }
+      bool need_seek = true;
+      for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+        if (need_seek) {
+          Key from(shard, state.last_read.load() + 1);
+          it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+          need_seek = false;
+        } else {
+          it->Next();
+        }
+        if (it->status().IsIncomplete()) {
+          ++::stats.cache_misses;
+          break;
+        }
+        assert(it->Valid());
+        assert(it->key().size() == sizeof(Key));
+        Key key;
+        memcpy(&key, it->key().data(), it->key().size());
+        // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+        //         shard, seq, key.shard(), key.seqno());
+        assert(key.shard() == shard);
+        assert(key.seqno() == seq);
+        state.last_read.store(seq);
+        ++::stats.read;
+      }
+    }
+  }
+
+  void onWrite(uint64_t shard) {
+    {
+      std::lock_guard<std::mutex> guard(queue_mutex_);
+      if (!shards_pending_set_.test(shard)) {
+        shards_pending_queue_.push(shard);
+        shards_pending_set_.set(shard);
+        sem_post(&sem_);
+      }
+    }
+  }
+
+  ~Reader() {
+    done_.store(true);
+    sem_post(&sem_);
+    thread_.join();
+  }
+
+ private:
+  char pad1[128] __attribute__((__unused__));
+  std::vector<ShardState>* shard_states_;
+  ROCKSDB_NAMESPACE::DB* db_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  sem_t sem_;
+  std::mutex queue_mutex_;
+  std::bitset<MAX_SHARDS> shards_pending_set_;
+  std::queue<uint64_t> shards_pending_queue_;
+  std::atomic<bool> done_{false};
+  char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+  explicit Writer(std::vector<ShardState>* shard_states,
+                  ROCKSDB_NAMESPACE::DB* db)
+      : shard_states_(shard_states), db_(db) {}
+
+  void start() { thread_ = port::Thread(&Writer::run, this); }
+
+  void run() {
+    std::queue<std::chrono::steady_clock::time_point> workq;
+    std::chrono::steady_clock::time_point deadline(
+        std::chrono::steady_clock::now() +
+        std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+    std::vector<uint64_t> my_shards;
+    for (int i = 1; i <= FLAGS_shards; ++i) {
+      if ((*shard_states_)[i].writer == this) {
+        my_shards.push_back(i);
+      }
+    }
+
+    std::mt19937 rng{std::random_device()()};
+    std::uniform_int_distribution<int> shard_dist(
+        0, static_cast<int>(my_shards.size()) - 1);
+    std::string value(FLAGS_value_size, '*');
+
+    while (1) {
+      auto now = std::chrono::steady_clock::now();
+      if (FLAGS_runtime >= 0 && now >= deadline) {
+        break;
+      }
+      if (workq.empty()) {
+        for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+          std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+          workq.push(now + offset);
+        }
+      }
+      while (!workq.empty() && workq.front() < now) {
+        workq.pop();
+        uint64_t shard = my_shards[shard_dist(rng)];
+        ShardState& state = (*shard_states_)[shard];
+        uint64_t seqno = state.last_written.load() + 1;
+        Key key(shard, seqno);
+        // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+        ROCKSDB_NAMESPACE::Status status =
+            db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+                     ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+                     ROCKSDB_NAMESPACE::Slice(value));
+        assert(status.ok());
+        state.last_written.store(seqno);
+        state.reader->onWrite(shard);
+        ++::stats.written;
+      }
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+    // fprintf(stderr, "Writer done\n");
+  }
+
+  ~Writer() { thread_.join(); }
+
+ private:
+  char pad1[128] __attribute__((__unused__));
+  std::vector<ShardState>* shard_states_;
+  ROCKSDB_NAMESPACE::DB* db_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+  explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+      : db_(db), thread_(&StatsThread::run, this) {}
+
+  void run() {
+    auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+    uint64_t wlast = 0, rlast = 0;
+    while (!done_.load()) {
+      {
+        std::unique_lock<std::mutex> lock(cvm_);
+        cv_.wait_for(lock, std::chrono::seconds(1));
+      }
+      auto now = std::chrono::steady_clock::now();
+      double elapsed =
+          std::chrono::duration_cast<std::chrono::duration<double> >(now -
+                                                                     tlast)
+              .count();
+      uint64_t w = ::stats.written.load();
+      uint64_t r = ::stats.read.load();
+      fprintf(stderr,
+              "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+              "r/s %10.0f | cache misses %10ld\n",
+              db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+              std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+                  .count(),
+              w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+              ::stats.cache_misses.load());
+      wlast = w;
+      rlast = r;
+      tlast = now;
+    }
+  }
+
+  ~StatsThread() {
+    {
+      std::lock_guard<std::mutex> guard(cvm_);
+      done_.store(true);
+    }
+    cv_.notify_all();
+    thread_.join();
+  }
+
+ private:
+  ROCKSDB_NAMESPACE::DB* db_;
+  std::mutex cvm_;
+  std::condition_variable cv_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  std::mt19937 rng{std::random_device()()};
+  ROCKSDB_NAMESPACE::Status status;
+  std::string path =
+      ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+  fprintf(stderr, "db path is %s\n", path.c_str());
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = true;
+  options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+  options.compaction_style =
+      ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 99999;
+  options.level0_stop_writes_trigger = 99999;
+  options.use_direct_io_for_flush_and_compaction = true;
+  options.write_buffer_size = FLAGS_memtable_size;
+  ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+  table_options.block_cache =
+      ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+  table_options.block_size = FLAGS_block_size;
+  options.table_factory.reset(
+      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+  status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+  assert(status.ok());
+  ROCKSDB_NAMESPACE::DB* db_raw;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+  assert(status.ok());
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+  std::vector<ShardState> shard_states(FLAGS_shards + 1);
+  std::deque<Reader> readers;
+  while (static_cast<int>(readers.size()) < FLAGS_readers) {
+    readers.emplace_back(&shard_states, db_raw);
+  }
+  std::deque<Writer> writers;
+  while (static_cast<int>(writers.size()) < FLAGS_writers) {
+    writers.emplace_back(&shard_states, db_raw);
+  }
+
+  // Each shard gets a random reader and random writer assigned to it
+  for (int i = 1; i <= FLAGS_shards; ++i) {
+    std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+    std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+    shard_states[i].reader = &readers[reader_dist(rng)];
+    shard_states[i].writer = &writers[writer_dist(rng)];
+  }
+
+  StatsThread stats_thread(db_raw);
+  for (Writer& w : writers) {
+    w.start();
+  }
+
+  writers.clear();
+  readers.clear();
+}
+#endif  // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/history_trimming_iterator.h b/src/rocksdb/db/history_trimming_iterator.h
new file mode 100644
index 000000000..b445ced33
--- /dev/null
+++ b/src/rocksdb/db/history_trimming_iterator.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class HistoryTrimmingIterator : public InternalIterator {
+ public:
+  explicit HistoryTrimmingIterator(InternalIterator* input,
+                                   const Comparator* cmp,
+                                   const std::string& ts)
+      : input_(input), filter_ts_(ts), cmp_(cmp) {
+    assert(cmp_->timestamp_size() > 0 && !ts.empty());
+  }
+
+  bool filter() const {
+    if (!input_->Valid()) {
+      return true;
+    }
+    Slice current_ts = ExtractTimestampFromKey(key(), cmp_->timestamp_size());
+    return cmp_->CompareTimestamp(current_ts, Slice(filter_ts_)) <= 0;
+  }
+
+  bool Valid() const override { return input_->Valid(); }
+
+  void SeekToFirst() override {
+    input_->SeekToFirst();
+    while (!filter()) {
+      input_->Next();
+    }
+  }
+
+  void SeekToLast() override {
+    input_->SeekToLast();
+    while (!filter()) {
+      input_->Prev();
+    }
+  }
+
+  void Seek(const Slice& target) override {
+    input_->Seek(target);
+    while (!filter()) {
+      input_->Next();
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    input_->SeekForPrev(target);
+    while (!filter()) {
+      input_->Prev();
+    }
+  }
+
+  void Next() override {
+    do {
+      input_->Next();
+    } while (!filter());
+  }
+
+  void Prev() override {
+    do {
+      input_->Prev();
+    } while (!filter());
+  }
+
+  Slice key() const override { return input_->key(); }
+
+  Slice value() const override { return input_->value(); }
+
+  Status status() const override { return input_->status(); }
+
+  bool IsKeyPinned() const override { return input_->IsKeyPinned(); }
+
+  bool IsValuePinned() const override { return input_->IsValuePinned(); }
+
+ private:
+  InternalIterator* input_;
+  const std::string filter_ts_;
+  const Comparator* const cmp_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..34985666a
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,312 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
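ImportColumnFamilyJob::Prepare() below rejects imports whose files would overlap within an LSM level: for each level it sorts the files by smallest key, after which it only needs to compare each file's largest key against its successor's smallest key. The same check in isolation, with string pairs standing in for IngestedFileInfo and plain std::string comparison standing in for the column family's internal comparator (RangesDisjoint and KeyRange are illustrative names):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// One file's [smallest, largest] key range within a level.
using KeyRange = std::pair<std::string, std::string>;

// Returns true iff no two ranges overlap. After sorting by smallest key,
// any overlap must show up between adjacent entries, so one linear pass
// of largest[i] vs smallest[i + 1] comparisons suffices.
static bool RangesDisjoint(std::vector<KeyRange> files) {
  std::sort(files.begin(), files.end(),
            [](const KeyRange& a, const KeyRange& b) {
              return a.first < b.first;
            });
  for (std::size_t i = 0; i + 1 < files.size(); ++i) {
    if (files[i].second >= files[i + 1].first) {
      return false;  // files[i] reaches into files[i + 1]
    }
  }
  return true;
}
```

Level 0 is exempt from the check for the usual LSM reason: L0 files may overlap each other by design, which is why the code below starts checking at level 1.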
+
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+                                      SuperVersion* sv) {
+  Status status;
+
+  // Read the information of files we are importing
+  for (const auto& file_metadata : metadata_) {
+    const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+    IngestedFileInfo file_to_import;
+    status =
+        GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv);
+    if (!status.ok()) {
+      return status;
+    }
+    files_to_import_.push_back(file_to_import);
+  }
+
+  auto num_files = files_to_import_.size();
+  if (num_files == 0) {
+    return Status::InvalidArgument("The list of files is empty");
+  } else if (num_files > 1) {
+    // Verify that passed files don't have overlapping ranges in any particular
+    // level.
+    int min_level = 1;  // Check for overlaps in Level 1 and above.
+    int max_level = -1;
+    for (const auto& file_metadata : metadata_) {
+      if (file_metadata.level > max_level) {
+        max_level = file_metadata.level;
+      }
+    }
+    for (int level = min_level; level <= max_level; ++level) {
+      autovector<const IngestedFileInfo*> sorted_files;
+      for (size_t i = 0; i < num_files; i++) {
+        if (metadata_[i].level == level) {
+          sorted_files.push_back(&files_to_import_[i]);
+        }
+      }
+
+      std::sort(
+          sorted_files.begin(), sorted_files.end(),
+          [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+            return cfd_->internal_comparator().Compare(
+                       info1->smallest_internal_key,
+                       info2->smallest_internal_key) < 0;
+          });
+
+      for (size_t i = 0; i + 1 < sorted_files.size(); i++) {
+        if (cfd_->internal_comparator().Compare(
+                sorted_files[i]->largest_internal_key,
+                sorted_files[i + 1]->smallest_internal_key) >= 0) {
+          return Status::InvalidArgument("Files have overlapping ranges");
+        }
+      }
+    }
+  }
+
+  for (const auto& f : files_to_import_) {
+    if (f.num_entries == 0) {
+      return Status::InvalidArgument("File contains no entries");
+    }
+
+    if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+      return Status::Corruption("File has corrupted keys");
+    }
+  }
+
+  // Copy/Move external files into DB
+  auto hardlink_files = import_options_.move_files;
+  for (auto& f : files_to_import_) {
+    const auto path_outside_db = f.external_file_path;
+    const auto path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+    if (hardlink_files) {
+      status =
+          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+      if (status.IsNotSupported()) {
+        // Original file is on a different FS, use copy instead of hard linking
+        hardlink_files = false;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Try to link file %s but it's not supported : %s",
+                       f.internal_file_path.c_str(), status.ToString().c_str());
+      }
+    }
+    if (!hardlink_files) {
+      status =
+          CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+                   db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+    }
+    if (!status.ok()) {
+      break;
+    }
+    f.copy_file = !hardlink_files;
+    f.internal_file_path = path_inside_db;
+  }
+
+  if (!status.ok()) {
+    // We failed, remove all files that we copied into the db
+    for (const auto& f : files_to_import_) {
+      if (f.internal_file_path.empty()) {
+        break;
+      }
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  Status status;
+  edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+  int64_t temp_current_time = 0;
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+  uint64_t current_time = kUnknownOldestAncesterTime;
+  if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+    current_time = oldest_ancester_time =
+        static_cast<uint64_t>(temp_current_time);
+  }
+
+  for (size_t i = 0; i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+
+    edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+                  f.fd.GetFileSize(), f.smallest_internal_key,
+                  f.largest_internal_key, file_metadata.smallest_seqno,
+                  file_metadata.largest_seqno, false, file_metadata.temperature,
+                  kInvalidBlobFileNumber, oldest_ancester_time, current_time,
+                  kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+                  f.unique_id);
+
+    // If incoming sequence number is higher, update local sequence number.
+    if (file_metadata.largest_seqno > versions_->LastSequence()) {
+      versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+      versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+      versions_->SetLastSequence(file_metadata.largest_seqno);
+    }
+  }
+
+  return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add files to the database; remove all the files we copied.
+    for (const auto& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+    const std::string& external_file, uint64_t new_file_number,
+    IngestedFileInfo* file_to_import, SuperVersion* sv) {
+  file_to_import->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_import->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Assign FD with number
+  file_to_import->fd =
+      FileDescriptor(new_file_number, 0, file_to_import->file_size);
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status =
+      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(
+      std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(
+          *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+          env_options_, cfd_->internal_comparator(),
+          /*skip_filters*/ false, /*immortal*/ false,
+          /*force_direct_prefetch*/ false, /*level*/ -1,
+          /*block_cache_tracer*/ nullptr,
+          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+          /*cur_file_num*/ new_file_number),
+      std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+
+  // Set original_seqno to 0.
+  file_to_import->original_seqno = 0;
+
+  // Get number of entries in table
+  file_to_import->num_entries = props->num_entries;
+
+  ParsedInternalKey key;
+  ReadOptions ro;
+  // While reading the external file we may cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // could still hold blocks whose keys carry the wrong seqno. We need to
+  // disable fill_cache so that we read from the file without updating the
+  // block cache.
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  Status pik_status =
+      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+  if (!pik_status.ok()) {
+    return Status::Corruption("Corrupted Key in external file. ",
+                              pik_status.getState());
+  }
+  file_to_import->smallest_internal_key.SetFrom(key);
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  pik_status =
+      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+  if (!pik_status.ok()) {
+    return Status::Corruption("Corrupted Key in external file. ",
+                              pik_status.getState());
+  }
+  file_to_import->largest_internal_key.SetFrom(key);
+
+  file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_import->table_properties = *props;
+
+  auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+                                  props->orig_file_number,
+                                  &(file_to_import->unique_id));
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get SST unique id for file %s",
+                   file_to_import->internal_file_path.c_str());
+  }
+
+  return status;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..57c49c67f
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,82 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct EnvOptions;
+class SystemClock;
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd,
+                        const ImmutableDBOptions& db_options,
+                        const EnvOptions& env_options,
+                        const ImportColumnFamilyOptions& import_options,
+                        const std::vector<LiveFileMetaData>& metadata,
+                        const std::shared_ptr<IOTracer>& io_tracer)
+      : clock_(db_options.clock),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        fs_(db_options_.fs, io_tracer),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata),
+        io_tracer_(io_tracer) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, + IngestedFileInfo* file_to_import, + SuperVersion* sv); + + SystemClock* clock_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const FileSystemPtr fs_; + const EnvOptions& env_options_; + autovector files_to_import_; + VersionEdit edit_; + const ImportColumnFamilyOptions& import_options_; + std::vector metadata_; + const std::shared_ptr io_tracer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc new file mode 100644 index 000000000..2847ea8da --- /dev/null +++ b/src/rocksdb/db/import_column_family_test.cc @@ -0,0 +1,644 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include + +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class ImportColumnFamilyTest : public DBTestBase { + public: + ImportColumnFamilyTest() + : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "/sst_files/"; + export_files_dir_ = test::PerThreadDBPath(env_, "export"); + DestroyAndRecreateExternalSSTFilesDir(); + import_cfh_ = nullptr; + import_cfh2_ = nullptr; + metadata_ptr_ = nullptr; + } + + ~ImportColumnFamilyTest() { + if (import_cfh_) { + EXPECT_OK(db_->DropColumnFamily(import_cfh_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + if (import_cfh2_) { + EXPECT_OK(db_->DropColumnFamily(import_cfh2_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_)); + import_cfh2_ = nullptr; + } + if (metadata_ptr_) { + delete metadata_ptr_; + metadata_ptr_ = nullptr; + } + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(env_->CreateDir(sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); + } + + LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, + int level, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { + LiveFileMetaData metadata; + metadata.name = name; + metadata.db_path = path; + metadata.smallest_seqno = smallest_seqno; + metadata.largest_seqno = largest_seqno; + metadata.level = level; + return metadata; + } + + protected: + std::string sst_files_dir_; + std::string export_files_dir_; + ColumnFamilyHandle* import_cfh_; + ColumnFamilyHandle* import_cfh2_; + ExportImportFilesMetaData* metadata_ptr_; +}; + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst_name = 
"cf_unknown.sst"; + const std::string unknown_sst = sst_files_dir_ + unknown_sst_name; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K3", "V1")); + ASSERT_OK(sfw_unknown.Put("K4", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + { + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); + ASSERT_EQ(value, "V1"); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value)); + ASSERT_EQ(value, "V1"); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); + ASSERT_EQ(value, "V2"); + } + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + + // verify sst unique id during reopen + options.verify_sst_unique_id_in_manifest = true; + ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options); +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + 
ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4")); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); + } + + // Flush to create L0 file. 
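+  // The flushes below set up the export/import round trip under test. A
+  // minimal sketch of that round trip, restricted to calls that appear
+  // verbatim in this file:
+  //
+  //   Checkpoint* checkpoint;
+  //   Checkpoint::Create(db_, &checkpoint);
+  //   checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+  //                                  &metadata_ptr_);  // SSTs + metadata
+  //   db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+  //                                     *metadata_ptr_, &import_cfh_);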
+  ASSERT_OK(Flush(1));
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+  }
+
+  // Flush again to create another L0 file. It should have a higher
+  // sequence number.
+  ASSERT_OK(Flush(1));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  ImportColumnFamilyOptions import_options;
+  import_options.move_files = false;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+                                              *metadata_ptr_, &import_cfh_));
+  ASSERT_NE(import_cfh_, nullptr);
+
+  import_options.move_files = true;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+                                              *metadata_ptr_, &import_cfh2_));
+  ASSERT_NE(import_cfh2_, nullptr);
+  delete metadata_ptr_;
+  metadata_ptr_ = nullptr;
+
+  std::string value1, value2;
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Get(1, Key(i)), value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+
+  // Modify keys in cf1 and verify.
+  for (int i = 0; i < 25; i++) {
+    ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+  }
+  for (int i = 25; i < 50; i++) {
+    ASSERT_OK(
+        db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+  }
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+  }
+  for (int i = 25; i < 50; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite3", value1);
+  }
+  for (int i = 50; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite2", value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+
+  // Compact and check again.
+  ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+  }
+  for (int i = 25; i < 50; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite3", value1);
+  }
+  for (int i = 50; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite2", value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+  }
+  ASSERT_OK(Flush(1));
+
+  // Compact to create an L1 file.
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+  // Overwrite the value in the same set of keys.
+  for (int i = 0; i < 50; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+  }
+
+  // Flush to create an L0 file.
+  ASSERT_OK(Flush(1));
+
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+  }
+
+  // Flush again to create another L0 file. It should have a higher
+  // sequence number.
+  ASSERT_OK(Flush(1));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  // Create a new db and import the files.
+  DB* db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(),
+                                                  "yoyo",
+                                                  ImportColumnFamilyOptions(),
+                                                  *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+
+  for (int i = 0; i < 100; ++i) {
+    std::string value;
+    ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value));
+    ASSERT_EQ(Get(1, Key(i)), value);
+  }
+  ASSERT_OK(db_copy->DropColumnFamily(cfh));
+  ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+  delete db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+}
+
+TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
+  // Imports a column family containing a level where two files overlap at
+  // their endpoints. "Overlap" means the largest user key in one file is the
+  // same as the smallest user key in the second file.
+  const int kFileBytes = 128 << 10;  // 128KB
+  const int kValueBytes = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 2;
+  CreateAndReopenWithCF({"koko"}, options);
+
+  Random rnd(301);
+  // Every key is snapshot protected to ensure older versions will not be
+  // dropped during compaction.
+  std::vector<const Snapshot*> snapshots;
+  snapshots.reserve(kFileBytes / kValueBytes * kNumFiles);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(Put(1, "key", value));
+      snapshots.push_back(db_->GetSnapshot());
+    }
+    ASSERT_OK(Flush(1));
+  }
+
+  // Compact to create overlapping L1 files.
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  // Create a new db and import the files.
+  DB* db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(),
+                                                  "yoyo",
+                                                  ImportColumnFamilyOptions(),
+                                                  *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+
+  {
+    std::string value;
+    ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value));
+  }
+  ASSERT_OK(db_copy->DropColumnFamily(cfh));
+  ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+  delete db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  for (const Snapshot* snapshot : snapshots) {
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  {
+    // Create column family with existing cf name.
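+    // This and the following blocks each exercise one rejection path of
+    // CreateColumnFamilyWithImport: duplicate CF name, empty file list,
+    // overlapping key ranges, comparator mismatch, and a missing SST file.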
+ ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
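+    // The SST below is written with ReverseBytewiseComparator(), so the
+    // exported metadata's db_comparator_name cannot match the target column
+    // family's default BytewiseComparator.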
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc new file mode 100644 index 000000000..ac5b81f3e --- /dev/null +++ b/src/rocksdb/db/internal_stats.cc @@ -0,0 +1,2002 @@ +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
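+// This file backs the read-only properties exposed through the public
+// DB::GetProperty() family. A minimal usage sketch against that public API
+// (assuming `db` points to an open rocksdb::DB):
+//
+//   std::string stats;
+//   if (db->GetProperty(rocksdb::DB::Properties::kCFStats, &stats)) {
+//     // `stats` now holds the human-readable per-CF compaction table.
+//   }
+//   uint64_t num_keys = 0;
+//   db->GetIntProperty(rocksdb::DB::Properties::kEstimateNumKeys, &num_keys);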
+ +#include "db/internal_stats.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" +#include "util/hash_containers.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE + +const std::map InternalStats::compaction_level_stats = + { + {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}}, + {LevelStatType::COMPACTED_FILES, + LevelStat{"CompactedFiles", "CompactedFiles"}}, + {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}}, + {LevelStatType::SCORE, LevelStat{"Score", "Score"}}, + {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}}, + {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}}, + {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}}, + {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}}, + {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}}, + {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}}, + {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}}, + {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}}, + {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}}, + {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}}, + {LevelStatType::COMP_CPU_SEC, + LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}}, + {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}}, + {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, + {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, + {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, +}; + +const std::map + InternalStats::db_stats_type_to_info = { + {InternalStats::kIntStatsWalFileBytes, + DBStatInfo{"db.wal_bytes_written"}}, + {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}}, + {InternalStats::kIntStatsBytesWritten, + DBStatInfo{"db.user_bytes_written"}}, + {InternalStats::kIntStatsNumKeysWritten, + DBStatInfo{"db.user_keys_written"}}, + {InternalStats::kIntStatsWriteDoneByOther, + DBStatInfo{"db.user_writes_by_other"}}, + {InternalStats::kIntStatsWriteDoneBySelf, + DBStatInfo{"db.user_writes_by_self"}}, + {InternalStats::kIntStatsWriteWithWal, + DBStatInfo{"db.user_writes_with_wal"}}, + {InternalStats::kIntStatsWriteStallMicros, + DBStatInfo{"db.user_write_stall_micros"}}, +}; + +namespace { +const double kMB = 1048576.0; +const double kGB = kMB * 1024; +const double kMicrosInSec = 1000000.0; + +void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, + const std::string& group_by) { + int written_size = + snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); + auto hdr = [](LevelStatType t) { + return InternalStats::compaction_level_stats.at(t).header_name.c_str(); + }; + int line_size = snprintf( + buf + written_size, len - written_size, + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", + // Note that we skip COMPACTED_FILES and merge it with Files column + group_by.c_str(), hdr(LevelStatType::NUM_FILES), + hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), + hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB), + 
hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB), + hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB), + hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS), + hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), + hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), + hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); + + written_size += line_size; + written_size = std::min(written_size, static_cast(len)); + snprintf(buf + written_size, len - written_size, "%s\n", + std::string(line_size, '-').c_str()); +} + +void PrepareLevelStats(std::map* level_stats, + int num_files, int being_compacted, + double total_file_size, double score, double w_amp, + const InternalStats::CompactionStats& stats) { + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; + + (*level_stats)[LevelStatType::NUM_FILES] = num_files; + (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; + (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size; + (*level_stats)[LevelStatType::SCORE] = score; + (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB; + (*level_stats)[LevelStatType::RN_GB] = + stats.bytes_read_non_output_levels / kGB; + (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB; + (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB; + (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB; + (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; + (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; + (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; + (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; + (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; + (*level_stats)[LevelStatType::AVG_SEC] = + stats.count == 0 ? 
0 : stats.micros / kMicrosInSec / stats.count; + (*level_stats)[LevelStatType::KEY_IN] = + static_cast(stats.num_input_records); + (*level_stats)[LevelStatType::KEY_DROP] = + static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; +} + +void PrintLevelStats(char* buf, size_t len, const std::string& name, + const std::map& stat_value) { + snprintf( + buf, len, + "%4s " /* Level */ + "%6d/%-3d " /* Files */ + "%8s " /* Size */ + "%5.1f " /* Score */ + "%8.1f " /* Read(GB) */ + "%7.1f " /* Rn(GB) */ + "%8.1f " /* Rnp1(GB) */ + "%9.1f " /* Write(GB) */ + "%8.1f " /* Wnew(GB) */ + "%9.1f " /* Moved(GB) */ + "%5.1f " /* W-Amp */ + "%8.1f " /* Rd(MB/s) */ + "%8.1f " /* Wr(MB/s) */ + "%9.2f " /* Comp(sec) */ + "%17.2f " /* CompMergeCPU(sec) */ + "%9d " /* Comp(cnt) */ + "%8.3f " /* Avg(sec) */ + "%7s " /* KeyIn */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ + name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), + static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), + BytesToHumanString( + static_cast(stat_value.at(LevelStatType::SIZE_BYTES))) + .c_str(), + stat_value.at(LevelStatType::SCORE), + stat_value.at(LevelStatType::READ_GB), + stat_value.at(LevelStatType::RN_GB), + stat_value.at(LevelStatType::RNP1_GB), + stat_value.at(LevelStatType::WRITE_GB), + stat_value.at(LevelStatType::W_NEW_GB), + stat_value.at(LevelStatType::MOVED_GB), + stat_value.at(LevelStatType::WRITE_AMP), + stat_value.at(LevelStatType::READ_MBPS), + stat_value.at(LevelStatType::WRITE_MBPS), + stat_value.at(LevelStatType::COMP_SEC), + stat_value.at(LevelStatType::COMP_CPU_SEC), + static_cast(stat_value.at(LevelStatType::COMP_COUNT)), + stat_value.at(LevelStatType::AVG_SEC), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_IN))) + .c_str(), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_DROP))) + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + stat_value.at(LevelStatType::W_BLOB_GB)); +} + +void PrintLevelStats(char* buf, size_t len, const std::string& name, + int num_files, int being_compacted, double total_file_size, + double score, double w_amp, + const InternalStats::CompactionStats& stats) { + std::map level_stats; + PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size, + score, w_amp, stats); + PrintLevelStats(buf, len, name, level_stats); +} + +// Assumes that trailing numbers represent an optional argument. This requires +// property names to not end with numbers. 
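+// For example, "rocksdb.num-files-at-level2" parses into the name
+// "rocksdb.num-files-at-level" and the argument "2", while a property with
+// no trailing digits yields an empty argument slice.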
+std::pair GetPropertyNameAndArg(const Slice& property) { + Slice name = property, arg = property; + size_t sfx_len = 0; + while (sfx_len < property.size() && + isdigit(property[property.size() - sfx_len - 1])) { + ++sfx_len; + } + name.remove_suffix(sfx_len); + arg.remove_prefix(property.size() - sfx_len); + return {name, arg}; +} +} // anonymous namespace + +static const std::string rocksdb_prefix = "rocksdb."; + +static const std::string num_files_at_level_prefix = "num-files-at-level"; +static const std::string compression_ratio_at_level_prefix = + "compression-ratio-at-level"; +static const std::string allstats = "stats"; +static const std::string sstables = "sstables"; +static const std::string cfstats = "cfstats"; +static const std::string cfstats_no_file_histogram = + "cfstats-no-file-histogram"; +static const std::string cf_file_histogram = "cf-file-histogram"; +static const std::string dbstats = "dbstats"; +static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; +static const std::string fast_block_cache_entry_stats = + "fast-block-cache-entry-stats"; +static const std::string num_immutable_mem_table = "num-immutable-mem-table"; +static const std::string num_immutable_mem_table_flushed = + "num-immutable-mem-table-flushed"; +static const std::string mem_table_flush_pending = "mem-table-flush-pending"; +static const std::string compaction_pending = "compaction-pending"; +static const std::string background_errors = "background-errors"; +static const std::string cur_size_active_mem_table = + "cur-size-active-mem-table"; +static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables"; +static const std::string size_all_mem_tables = "size-all-mem-tables"; +static const std::string num_entries_active_mem_table = + "num-entries-active-mem-table"; +static const std::string num_entries_imm_mem_tables = + "num-entries-imm-mem-tables"; +static const std::string num_deletes_active_mem_table = + "num-deletes-active-mem-table"; +static const std::string num_deletes_imm_mem_tables = + "num-deletes-imm-mem-tables"; +static const std::string estimate_num_keys = "estimate-num-keys"; +static const std::string estimate_table_readers_mem = + "estimate-table-readers-mem"; +static const std::string is_file_deletions_enabled = + "is-file-deletions-enabled"; +static const std::string num_snapshots = "num-snapshots"; +static const std::string oldest_snapshot_time = "oldest-snapshot-time"; +static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence"; +static const std::string num_live_versions = "num-live-versions"; +static const std::string current_version_number = + "current-super-version-number"; +static const std::string estimate_live_data_size = "estimate-live-data-size"; +static const std::string min_log_number_to_keep_str = "min-log-number-to-keep"; +static const std::string min_obsolete_sst_number_to_keep_str = + "min-obsolete-sst-number-to-keep"; +static const std::string base_level_str = "base-level"; +static const std::string total_sst_files_size = "total-sst-files-size"; +static const std::string live_sst_files_size = "live-sst-files-size"; +static const std::string live_sst_files_size_at_temperature = + "live-sst-files-size-at-temperature"; +static const std::string estimate_pending_comp_bytes = + "estimate-pending-compaction-bytes"; +static const std::string aggregated_table_properties = + "aggregated-table-properties"; +static const std::string aggregated_table_properties_at_level = + 
aggregated_table_properties + "-at-level"; +static const std::string num_running_compactions = "num-running-compactions"; +static const std::string num_running_flushes = "num-running-flushes"; +static const std::string actual_delayed_write_rate = + "actual-delayed-write-rate"; +static const std::string is_write_stopped = "is-write-stopped"; +static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; +static const std::string block_cache_capacity = "block-cache-capacity"; +static const std::string block_cache_usage = "block-cache-usage"; +static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; +static const std::string options_statistics = "options-statistics"; +static const std::string num_blob_files = "num-blob-files"; +static const std::string blob_stats = "blob-stats"; +static const std::string total_blob_file_size = "total-blob-file-size"; +static const std::string live_blob_file_size = "live-blob-file-size"; +static const std::string live_blob_file_garbage_size = + "live-blob-file-garbage-size"; +static const std::string blob_cache_capacity = "blob-cache-capacity"; +static const std::string blob_cache_usage = "blob-cache-usage"; +static const std::string blob_cache_pinned_usage = "blob-cache-pinned-usage"; + +const std::string DB::Properties::kNumFilesAtLevelPrefix = + rocksdb_prefix + num_files_at_level_prefix; +const std::string DB::Properties::kCompressionRatioAtLevelPrefix = + rocksdb_prefix + compression_ratio_at_level_prefix; +const std::string DB::Properties::kStats = rocksdb_prefix + allstats; +const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; +const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; +const std::string DB::Properties::kCFStatsNoFileHistogram = + rocksdb_prefix + cfstats_no_file_histogram; +const std::string DB::Properties::kCFFileHistogram = + rocksdb_prefix + cf_file_histogram; +const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; +const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; +const std::string DB::Properties::kFastBlockCacheEntryStats = + rocksdb_prefix + fast_block_cache_entry_stats; +const std::string DB::Properties::kNumImmutableMemTable = + rocksdb_prefix + num_immutable_mem_table; +const std::string DB::Properties::kNumImmutableMemTableFlushed = + rocksdb_prefix + num_immutable_mem_table_flushed; +const std::string DB::Properties::kMemTableFlushPending = + rocksdb_prefix + mem_table_flush_pending; +const std::string DB::Properties::kCompactionPending = + rocksdb_prefix + compaction_pending; +const std::string DB::Properties::kNumRunningCompactions = + rocksdb_prefix + num_running_compactions; +const std::string DB::Properties::kNumRunningFlushes = + rocksdb_prefix + num_running_flushes; +const std::string DB::Properties::kBackgroundErrors = + rocksdb_prefix + background_errors; +const std::string DB::Properties::kCurSizeActiveMemTable = + rocksdb_prefix + cur_size_active_mem_table; +const std::string DB::Properties::kCurSizeAllMemTables = + rocksdb_prefix + cur_size_all_mem_tables; +const std::string DB::Properties::kSizeAllMemTables = + rocksdb_prefix + size_all_mem_tables; +const std::string DB::Properties::kNumEntriesActiveMemTable = + rocksdb_prefix + num_entries_active_mem_table; +const std::string DB::Properties::kNumEntriesImmMemTables = + rocksdb_prefix + num_entries_imm_mem_tables; +const std::string 
DB::Properties::kNumDeletesActiveMemTable = + rocksdb_prefix + num_deletes_active_mem_table; +const std::string DB::Properties::kNumDeletesImmMemTables = + rocksdb_prefix + num_deletes_imm_mem_tables; +const std::string DB::Properties::kEstimateNumKeys = + rocksdb_prefix + estimate_num_keys; +const std::string DB::Properties::kEstimateTableReadersMem = + rocksdb_prefix + estimate_table_readers_mem; +const std::string DB::Properties::kIsFileDeletionsEnabled = + rocksdb_prefix + is_file_deletions_enabled; +const std::string DB::Properties::kNumSnapshots = + rocksdb_prefix + num_snapshots; +const std::string DB::Properties::kOldestSnapshotTime = + rocksdb_prefix + oldest_snapshot_time; +const std::string DB::Properties::kOldestSnapshotSequence = + rocksdb_prefix + oldest_snapshot_sequence; +const std::string DB::Properties::kNumLiveVersions = + rocksdb_prefix + num_live_versions; +const std::string DB::Properties::kCurrentSuperVersionNumber = + rocksdb_prefix + current_version_number; +const std::string DB::Properties::kEstimateLiveDataSize = + rocksdb_prefix + estimate_live_data_size; +const std::string DB::Properties::kMinLogNumberToKeep = + rocksdb_prefix + min_log_number_to_keep_str; +const std::string DB::Properties::kMinObsoleteSstNumberToKeep = + rocksdb_prefix + min_obsolete_sst_number_to_keep_str; +const std::string DB::Properties::kTotalSstFilesSize = + rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kLiveSstFilesSize = + rocksdb_prefix + live_sst_files_size; +const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str; +const std::string DB::Properties::kEstimatePendingCompactionBytes = + rocksdb_prefix + estimate_pending_comp_bytes; +const std::string DB::Properties::kAggregatedTableProperties = + rocksdb_prefix + aggregated_table_properties; +const std::string DB::Properties::kAggregatedTablePropertiesAtLevel = + rocksdb_prefix + aggregated_table_properties_at_level; +const std::string DB::Properties::kActualDelayedWriteRate = + rocksdb_prefix + actual_delayed_write_rate; +const std::string DB::Properties::kIsWriteStopped = + rocksdb_prefix + is_write_stopped; +const std::string DB::Properties::kEstimateOldestKeyTime = + rocksdb_prefix + estimate_oldest_key_time; +const std::string DB::Properties::kBlockCacheCapacity = + rocksdb_prefix + block_cache_capacity; +const std::string DB::Properties::kBlockCacheUsage = + rocksdb_prefix + block_cache_usage; +const std::string DB::Properties::kBlockCachePinnedUsage = + rocksdb_prefix + block_cache_pinned_usage; +const std::string DB::Properties::kOptionsStatistics = + rocksdb_prefix + options_statistics; +const std::string DB::Properties::kLiveSstFilesSizeAtTemperature = + rocksdb_prefix + live_sst_files_size_at_temperature; +const std::string DB::Properties::kNumBlobFiles = + rocksdb_prefix + num_blob_files; +const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats; +const std::string DB::Properties::kTotalBlobFileSize = + rocksdb_prefix + total_blob_file_size; +const std::string DB::Properties::kLiveBlobFileSize = + rocksdb_prefix + live_blob_file_size; +const std::string DB::Properties::kLiveBlobFileGarbageSize = + rocksdb_prefix + live_blob_file_garbage_size; +const std::string DB::Properties::kBlobCacheCapacity = + rocksdb_prefix + blob_cache_capacity; +const std::string DB::Properties::kBlobCacheUsage = + rocksdb_prefix + blob_cache_usage; +const std::string DB::Properties::kBlobCachePinnedUsage = + rocksdb_prefix + blob_cache_pinned_usage; + +const std::string 
InternalStats::kPeriodicCFStats = + DB::Properties::kCFStats + ".periodic"; +const int InternalStats::kMaxNoChangePeriodSinceDump = 8; + +const UnorderedMap + InternalStats::ppt_name_to_info = { + {DB::Properties::kNumFilesAtLevelPrefix, + {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCompressionRatioAtLevelPrefix, + {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr, + nullptr, nullptr}}, + {DB::Properties::kLevelStats, + {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kStats, + {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kCFStats, + {false, &InternalStats::HandleCFStats, nullptr, + &InternalStats::HandleCFMapStats, nullptr}}, + {InternalStats::kPeriodicCFStats, + {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCFStatsNoFileHistogram, + {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCFFileHistogram, + {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, + nullptr}}, + {DB::Properties::kDBStats, + {false, &InternalStats::HandleDBStats, nullptr, + &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, + {DB::Properties::kFastBlockCacheEntryStats, + {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr, + &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}}, + {DB::Properties::kSSTables, + {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, + {DB::Properties::kAggregatedTableProperties, + {false, &InternalStats::HandleAggregatedTableProperties, nullptr, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, + {DB::Properties::kAggregatedTablePropertiesAtLevel, + {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, + {DB::Properties::kNumImmutableMemTable, + {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, + nullptr}}, + {DB::Properties::kNumImmutableMemTableFlushed, + {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed, + nullptr, nullptr}}, + {DB::Properties::kMemTableFlushPending, + {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr, + nullptr}}, + {DB::Properties::kCompactionPending, + {false, nullptr, &InternalStats::HandleCompactionPending, nullptr, + nullptr}}, + {DB::Properties::kBackgroundErrors, + {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr, + nullptr}}, + {DB::Properties::kCurSizeActiveMemTable, + {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr, + nullptr}}, + {DB::Properties::kCurSizeAllMemTables, + {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr, + nullptr}}, + {DB::Properties::kSizeAllMemTables, + {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr, + nullptr}}, + {DB::Properties::kNumEntriesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable, + nullptr, nullptr}}, + {DB::Properties::kNumEntriesImmMemTables, + {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr, + nullptr}}, + {DB::Properties::kNumDeletesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable, + nullptr, nullptr}}, + 
{DB::Properties::kNumDeletesImmMemTables, + {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr, + nullptr}}, + {DB::Properties::kEstimateNumKeys, + {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr, + nullptr}}, + {DB::Properties::kEstimateTableReadersMem, + {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr, + nullptr}}, + {DB::Properties::kIsFileDeletionsEnabled, + {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr, + nullptr}}, + {DB::Properties::kNumSnapshots, + {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr, + nullptr}}, + {DB::Properties::kOldestSnapshotTime, + {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr, + nullptr}}, + {DB::Properties::kOldestSnapshotSequence, + {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr, + nullptr}}, + {DB::Properties::kNumLiveVersions, + {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr, + nullptr}}, + {DB::Properties::kCurrentSuperVersionNumber, + {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber, + nullptr, nullptr}}, + {DB::Properties::kEstimateLiveDataSize, + {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr, + nullptr}}, + {DB::Properties::kMinLogNumberToKeep, + {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr, + nullptr}}, + {DB::Properties::kMinObsoleteSstNumberToKeep, + {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep, + nullptr, nullptr}}, + {DB::Properties::kBaseLevel, + {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}}, + {DB::Properties::kTotalSstFilesSize, + {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr, + nullptr}}, + {DB::Properties::kLiveSstFilesSize, + {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr, + nullptr}}, + {DB::Properties::kLiveSstFilesSizeAtTemperature, + {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr, + nullptr, nullptr}}, + {DB::Properties::kEstimatePendingCompactionBytes, + {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, + nullptr, nullptr}}, + {DB::Properties::kNumRunningFlushes, + {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr, + nullptr}}, + {DB::Properties::kNumRunningCompactions, + {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr, + nullptr}}, + {DB::Properties::kActualDelayedWriteRate, + {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr, + nullptr}}, + {DB::Properties::kIsWriteStopped, + {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr, + nullptr}}, + {DB::Properties::kEstimateOldestKeyTime, + {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr, + nullptr}}, + {DB::Properties::kBlockCacheCapacity, + {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr, + nullptr}}, + {DB::Properties::kBlockCacheUsage, + {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr, + nullptr}}, + {DB::Properties::kBlockCachePinnedUsage, + {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, + nullptr}}, + {DB::Properties::kOptionsStatistics, + {true, nullptr, nullptr, nullptr, + &DBImpl::GetPropertyHandleOptionsStatistics}}, + {DB::Properties::kNumBlobFiles, + {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr, + nullptr}}, + {DB::Properties::kBlobStats, + {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kTotalBlobFileSize, + {false, nullptr, 
&InternalStats::HandleTotalBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileSize, + {false, nullptr, &InternalStats::HandleLiveBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileGarbageSize, + {false, nullptr, &InternalStats::HandleLiveBlobFileGarbageSize, + nullptr, nullptr}}, + {DB::Properties::kBlobCacheCapacity, + {false, nullptr, &InternalStats::HandleBlobCacheCapacity, nullptr, + nullptr}}, + {DB::Properties::kBlobCacheUsage, + {false, nullptr, &InternalStats::HandleBlobCacheUsage, nullptr, + nullptr}}, + {DB::Properties::kBlobCachePinnedUsage, + {false, nullptr, &InternalStats::HandleBlobCachePinnedUsage, nullptr, + nullptr}}, +}; + +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + has_cf_change_since_dump_(true), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) { + Cache* block_cache = GetBlockCacheForStats(); + if (block_cache) { + // Extract or create stats collector. Could fail in rare cases. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (s.ok()) { + assert(cache_entry_stats_collector_); + } else { + assert(!cache_entry_stats_collector_); + } + } +} + +void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, + bool foreground) { + CollectCacheEntryStats(foreground); + if (cache_entry_stats_collector_) { + cache_entry_stats_collector_->GetStats(stats); + } +} + +void InternalStats::CollectCacheEntryStats(bool foreground) { + // This function is safe to call from any thread because + // cache_entry_stats_collector_ field is const after constructor + // and ->GetStats does its own synchronization, which also suffices for + // cache_entry_stats_. + + if (!cache_entry_stats_collector_) { + return; // nothing to do (e.g. no block cache) + } + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. + int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 
10 : 500; + cache_entry_stats_collector_->CollectStats(min_interval_seconds, + min_interval_factor); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + auto e = role_map_.find(deleter); + size_t role_idx; + if (e == role_map_.end()) { + role_idx = static_cast(CacheEntryRole::kMisc); + } else { + role_idx = static_cast(e->second); + } + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + role_map_ = CopyCacheDeleterRoleMap(); + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache) << "#" + << port::GetProcessID(); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); + cache_usage = cache->GetUsage(); + table_size = cache->GetTableAddressCount(); + occupancy = cache->GetOccupancyCount(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " usage: " << BytesToHumanString(cache_usage) + << " table_size: " << table_size << " occupancy: " << occupancy + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v[BlockCacheEntryStatsMapKeys::CacheId()] = cache_id; + v[BlockCacheEntryStatsMapKeys::CacheCapacityBytes()] = + std::to_string(cache_capacity); + v[BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()] = + std::to_string(GetLastDurationMicros() / 1000000.0); + v[BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()] = + std::to_string((clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + auto role = static_cast(i); + v[BlockCacheEntryStatsMapKeys::EntryCount(role)] = + std::to_string(entry_counts[i]); + v[BlockCacheEntryStatsMapKeys::UsedBytes(role)] = + std::to_string(total_charges[i]); + v[BlockCacheEntryStatsMapKeys::UsedPercent(role)] = + std::to_string(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value, + bool fast) { + if 
+bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value,
+                                                       bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  *value = stats.ToString(clock_);
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMapInternal(
+    std::map<std::string, std::string>* values, bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  stats.ToMap(values, clock_);
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
+                                               Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsInternal(value, false /* fast */);
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsMapInternal(values, false /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStats(std::string* value,
+                                                   Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsInternal(value, true /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */);
+}
+
+bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value,
+                                                        Slice suffix) {
+  uint64_t temperature;
+  bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty();
+  if (!ok) {
+    return false;
+  }
+
+  uint64_t size = 0;
+  const auto* vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); level++) {
+    for (const auto& file_meta : vstorage->LevelFiles(level)) {
+      if (static_cast<uint64_t>(file_meta->temperature) == temperature) {
+        size += file_meta->fd.GetFileSize();
+      }
+    }
+  }
+
+  *value = std::to_string(size);
+  return true;
+}
+
+bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/,
+                                       Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  const auto& blob_files = vstorage->GetBlobFiles();
+
+  *value = blob_files.size();
+
+  return true;
+}
+
+bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  const auto blob_st = vstorage->GetBlobStats();
+
+  std::ostringstream oss;
+
+  oss << "Number of blob files: " << vstorage->GetBlobFiles().size()
+      << "\nTotal size of blob files: " << blob_st.total_file_size
+      << "\nTotal size of garbage in blob files: "
+      << blob_st.total_garbage_size
+      << "\nBlob file space amplification: " << blob_st.space_amp << '\n';
+
+  value->append(oss.str());
+
+  return true;
+}
+
+bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  *value = cfd_->GetTotalBlobFileSize();
+
+  return true;
+}
+
+bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  *value = vstorage->GetBlobStats().total_file_size;
+
+  return true;
+}
+
+bool InternalStats::HandleLiveBlobFileGarbageSize(uint64_t* value,
+                                                  DBImpl* /*db*/,
+                                                  Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  *value = vstorage->GetBlobStats().total_garbage_size;
+
+  return true;
+}
+
+Cache* InternalStats::GetBlobCacheForStats() {
+  return cfd_->ioptions()->blob_cache.get();
+}
+
+bool InternalStats::HandleBlobCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetCapacity());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlobCacheUsage(uint64_t* value, DBImpl* /*db*/,
+                                         Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetUsage());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetPinnedUsage());
+    return true;
+  }
+  return false;
+}
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+  std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+  auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+  if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+    return nullptr;
+  }
+  return &ppt_info_iter->second;
+}
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+                                      const Slice& property,
+                                      std::string* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_string != nullptr);
+  Slice arg = GetPropertyNameAndArg(property).second;
+  return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+                                   const Slice& property,
+                                   std::map<std::string, std::string>* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_map != nullptr);
+  Slice arg = GetPropertyNameAndArg(property).second;
+  return (this->*(property_info.handle_map))(value, arg);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+                                   uint64_t* value, DBImpl* db) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         !property_info.need_out_of_mutex);
+  db->mutex_.AssertHeld();
+  return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+    const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         property_info.need_out_of_mutex);
+  return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  } else {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d",
+             vstorage->NumLevelFiles(static_cast<int>(level)));
+    *value = buf;
+    return true;
+  }
+}
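Suffix-style properties such as the handlers above are queried by appending the numeric argument directly to the property name; ConsumeDecimalNumber then parses it back out of the suffix. A hedged sketch, assuming an open DB* db (the temperature code "4" is assumed to correspond to Temperature::kHot in current headers and is worth re-checking):

// Minimal sketch, assuming an open DB* db.
std::string num_files_l2;
db->GetProperty("rocksdb.num-files-at-level2", &num_files_l2);

std::string hot_sst_bytes;  // "4" assumed to be Temperature::kHot
db->GetProperty(
    rocksdb::DB::Properties::kLiveSstFilesSizeAtTemperature + "4",
    &hot_sst_bytes);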
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+                                                        Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+    return false;
+  }
+  *value = std::to_string(
+      vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+  return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+  char buf[1000];
+  const auto* vstorage = cfd_->current()->storage_info();
+  snprintf(buf, sizeof(buf),
+           "Level Files Size(MB)\n"
+           "--------------------\n");
+  value->append(buf);
+
+  for (int level = 0; level < number_levels_; level++) {
+    snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+             vstorage->NumLevelFiles(level),
+             vstorage->NumLevelBytes(level) / kMB);
+    value->append(buf);
+  }
+  return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+  if (!HandleCFStats(value, suffix)) {
+    return false;
+  }
+  if (!HandleDBStats(value, suffix)) {
+    return false;
+  }
+  return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+    std::map<std::string, std::string>* cf_stats, Slice /*suffix*/) {
+  DumpCFMapStats(cf_stats);
+  return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+  DumpCFStats(value);
+  return true;
+}
+
+bool InternalStats::HandleCFStatsPeriodic(std::string* value,
+                                          Slice /*suffix*/) {
+  bool has_change = has_cf_change_since_dump_;
+  if (!has_change) {
+    // If the file histogram changes, there is activity in this period too.
+    uint64_t new_histogram_num = 0;
+    for (int level = 0; level < number_levels_; level++) {
+      new_histogram_num += file_read_latency_[level].num();
+    }
+    new_histogram_num += blob_file_read_latency_.num();
+    if (new_histogram_num != last_histogram_num) {
+      has_change = true;
+      last_histogram_num = new_histogram_num;
+    }
+  }
+  if (has_change) {
+    no_cf_change_period_since_dump_ = 0;
+    has_cf_change_since_dump_ = false;
+  } else if (no_cf_change_period_since_dump_++ > 0) {
+    // Not ready to sync
+    if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) {
+      // Next periodic dump, we need to dump stats even if there is no change.
+      no_cf_change_period_since_dump_ = 0;
+    }
+    return true;
+  }
+
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value);
+  DumpCFFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+                                                 Slice /*suffix*/) {
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+  return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+                                          Slice /*suffix*/) {
+  DumpCFFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleDBMapStats(
+    std::map<std::string, std::string>* db_stats, Slice /*suffix*/) {
+  DumpDBMapStats(db_stats);
+  return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+  DumpDBStats(value);
+  return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+  auto* current = cfd_->current();
+  *value = current->DebugString(true, true);
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+                                                    Slice /*suffix*/) {
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = tp->ToString();
+  return true;
+}
+
+static std::map<std::string, std::string> MapUint64ValuesToString(
+    const std::map<std::string, uint64_t>& from) {
+  std::map<std::string, std::string> to;
+  for (const auto& e : from) {
+    to[e.first] = std::to_string(e.second);
+  }
+  return to;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+  if (!s.ok()) {
+    return false;
+  }
+  *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values,
+                                                           Slice suffix) {
+  uint64_t level;
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(
+      &tp, static_cast<int>(level));
+  if (!s.ok()) {
+    return false;
+  }
+  *values = tp->ToString();
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap(
+    std::map<std::string, std::string>* values, Slice suffix) {
+  uint64_t level;
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(
+      &tp, static_cast<int>(level));
+  if (!s.ok()) {
+    return false;
+  }
+  *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = cfd_->imm()->NumNotFlushed();
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+                                                      DBImpl* /*db*/,
+                                                      Version* /*version*/) {
+  *value = cfd_->imm()->NumFlushed();
+  return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+                                            Version* /*version*/) {
+  *value = db->num_running_flushes();
+  return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  // 1 if the system already determines at least one compaction is needed;
+  // 0 otherwise.
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                                Version* /*version*/) {
+  *value = db->num_running_compactions_;
+  return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  // Accumulated number of errors in background flushes or compactions.
+  *value = GetBackgroundErrorCount();
+  return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // Current size of the active memtable
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast();
+  return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                              Version* /*version*/) {
+  // Current size of the active memtable + immutable memtables
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+           cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+           cfd_->imm()->ApproximateMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of entries in the active memtable
+  *value = cfd_->mem()->num_entries();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of entries in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumEntries();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of deletes in the active memtable
+  *value = cfd_->mem()->num_deletes();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of deletes in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumDeletes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  // Estimate number of entries in the column family:
+  // Use estimated entries in tables + total entries in memtables.
+  const auto* vstorage = cfd_->current()->storage_info();
+  uint64_t estimate_keys = cfd_->mem()->num_entries() +
+                           cfd_->imm()->current()->GetTotalNumEntries() +
+                           vstorage->GetEstimatedActiveKeys();
+  uint64_t estimate_deletes =
+      cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
+  *value = estimate_keys > estimate_deletes * 2
+               ? estimate_keys - (estimate_deletes * 2)
+               : 0;
+  return true;
+}
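HandleEstimateNumKeys subtracts twice the tombstone count because each delete is itself an entry and is expected to cancel one put. A minimal sketch of reading the estimate, assuming an open DB* db:

// Minimal sketch, assuming an open DB* db.
uint64_t approx_keys = 0;
if (db->GetIntProperty(rocksdb::DB::Properties::kEstimateNumKeys,
                       &approx_keys)) {
  // entries - 2 * deletes, clamped at zero, per the handler above.
}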
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+                                       Version* /*version*/) {
+  *value = db->snapshots().count();
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+  return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  *value = cfd_->GetNumLiveVersions();
+  return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+                                                    DBImpl* /*db*/,
+                                                    Version* /*version*/) {
+  *value = cfd_->GetSuperVersionNumber();
+  return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = db->IsFileDeletionsEnabled() ? 1 : 0;
+  return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+                                    Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->base_level();
+  return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  *value = cfd_->GetTotalSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  *value = cfd_->GetLiveSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+                                                         DBImpl* /*db*/,
+                                                         Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->estimated_compaction_needed_bytes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+                                                  DBImpl* /*db*/,
+                                                  Version* version) {
+  *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+  return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+                                               Version* version) {
+  const auto* vstorage = version->storage_info();
+  *value = vstorage->EstimateLiveDataSize();
+  return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = db->MinLogNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+                                                     DBImpl* db,
+                                                     Version* /*version*/) {
+  *value = db->MinObsoleteSstNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  const WriteController& wc = db->write_controller();
+  if (!wc.NeedsDelay()) {
+    *value = 0;
+  } else {
+    *value = wc.delayed_write_rate();
+  }
+  return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+                                         Version* /*version*/) {
+  *value = db->write_controller().IsStopped() ? 1 : 0;
+  return true;
+}
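The two handlers above are the usual way to observe write throttling from outside. A minimal sketch, assuming an open DB* db:

// Minimal sketch, assuming an open DB* db.
uint64_t stopped = 0;
uint64_t delayed_rate = 0;
db->GetIntProperty(rocksdb::DB::Properties::kIsWriteStopped, &stopped);
db->GetIntProperty(rocksdb::DB::Properties::kActualDelayedWriteRate,
                   &delayed_rate);
if (stopped != 0) {
  // Writes are fully stopped by the WriteController.
} else if (delayed_rate > 0) {
  // Writes are being delayed to roughly delayed_rate bytes/sec.
}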
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // TODO(yiwu): The property is currently available for fifo compaction
+  // with allow_compaction = false. This is because we don't propagate
+  // oldest_key_time on compaction.
+  if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+      cfd_->GetCurrentMutableCFOptions()
+          ->compaction_options_fifo.allow_compaction) {
+    return false;
+  }
+
+  TablePropertiesCollection collection;
+  auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = std::numeric_limits<uint64_t>::max();
+  for (auto& p : collection) {
+    *value = std::min(*value, p.second->oldest_key_time);
+    if (*value == 0) {
+      break;
+    }
+  }
+  if (*value > 0) {
+    *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+                       cfd_->imm()->ApproximateOldestKeyTime(), *value});
+  }
+  return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
+Cache* InternalStats::GetBlockCacheForStats() {
+  auto* table_factory = cfd_->ioptions()->table_factory.get();
+  assert(table_factory != nullptr);
+  return table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+                                             Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetCapacity());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetUsage());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+    return true;
+  }
+  return false;
+}
+
+void InternalStats::DumpDBMapStats(
+    std::map<std::string, std::string>* db_stats) {
+  for (int i = 0; i < static_cast<int>(kIntStatsNumMax); ++i) {
+    InternalDBStatsType type = static_cast<InternalDBStatsType>(i);
+    (*db_stats)[db_stats_type_to_info.at(type).property_name] =
+        std::to_string(GetDBStats(type));
+  }
+  double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+  (*db_stats)["db.uptime"] = std::to_string(seconds_up);
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+  char buf[1000];
+  // DB-level stats, only available from default column family
+  double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf),
+           "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  // Cumulative
+  uint64_t user_bytes_written =
+      GetDBStats(InternalStats::kIntStatsBytesWritten);
+  uint64_t num_keys_written =
+      GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+  uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+  uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+  uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+  uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+  uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+  uint64_t write_stall_micros =
+      GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+  const int kHumanMicrosLen = 32;
+  char human_micros[kHumanMicrosLen];
+
+  // Data
+  // writes: total number of write requests.
+  // keys: total number of key updates issued by all the write requests
+  // commit groups: number of group commits issued to the DB. Each group can
+  //                contain one or more writes.
+  // So keys/writes is the average number of puts per write request, and
+  // writes/groups is the average group commit size.
+  //
+  // The format is the same for interval stats.
+  snprintf(buf, sizeof(buf),
+           "Cumulative writes: %s writes, %s keys, %s commit groups, "
+           "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
+           (write_other + write_self) /
+               std::max(1.0, static_cast<double>(write_self)),
+           user_bytes_written / kGB,
+           user_bytes_written / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // WAL
+  snprintf(buf, sizeof(buf),
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
+           write_with_wal / std::max(1.0, static_cast<double>(wal_synced)),
+           wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+  value->append(buf);
+
+  // Interval
+  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
+  snprintf(
+      buf, sizeof(buf),
+      "Interval writes: %s writes, %s keys, %s commit groups, "
+      "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+      NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+      NumberToHumanString(interval_num_keys_written).c_str(),
+      NumberToHumanString(interval_write_self).c_str(),
+      static_cast<double>(interval_write_other + interval_write_self) /
+          std::max(1.0, static_cast<double>(interval_write_self)),
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  uint64_t interval_write_with_wal =
+      write_with_wal - db_stats_snapshot_.write_with_wal;
+  uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+  uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+  snprintf(buf, sizeof(buf),
+           "Interval WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(interval_write_with_wal).c_str(),
+           NumberToHumanString(interval_wal_synced).c_str(),
+           interval_write_with_wal /
+               std::max(1.0, static_cast<double>(interval_wal_synced)),
+           interval_wal_bytes / kGB,
+           interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  // Stall
+  AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+                    human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  db_stats_snapshot_.seconds_up = seconds_up;
+  db_stats_snapshot_.ingest_bytes = user_bytes_written;
+  db_stats_snapshot_.write_other = write_other;
+  db_stats_snapshot_.write_self = write_self;
+  db_stats_snapshot_.num_keys_written = num_keys_written;
+  db_stats_snapshot_.wal_bytes = wal_bytes;
+  db_stats_snapshot_.wal_synced = wal_synced;
+  db_stats_snapshot_.write_with_wal = write_with_wal;
+  db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
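Every "Interval ..." line above follows the same pattern: subtract the snapshot taken at the previous dump from the cumulative counter, then normalize by the interval uptime. A minimal sketch of that arithmetic (kMB is assumed to be 2^20 as a double, matching this file's constants):

// Minimal sketch of the snapshot/diff arithmetic used by DumpDBStats.
static double IntervalMBps(uint64_t cumulative_bytes, uint64_t snapshot_bytes,
                           double interval_seconds_up) {
  const double kMB = 1048576.0;  // assumption: matches this file's kMB
  uint64_t interval_bytes = cumulative_bytes - snapshot_bytes;
  // The 0.001-second floor avoids division by zero right after startup or
  // immediately after the previous dump.
  return interval_bytes / kMB / std::max(interval_seconds_up, 0.001);
}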
+
+/**
+ * Dump compaction level stats to a map of stat name (with a "compaction."
+ * prefix) to value, a double encoded as a string. The level in the stat name
+ * is represented with a prefix "Lx" where "x" is the level number. A special
+ * level "Sum" represents the sum of a stat for all levels.
+ * The result also contains IO stall counters whose keys start with
+ * "io_stalls." and whose values are uint64 encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+    std::map<std::string, std::string>* cf_stats) {
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+  CompactionStats compaction_stats_sum;
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+  for (auto const& level_ent : levels_stats) {
+    auto level_str =
+        level_ent.first == -1 ? "Sum" : "L" + std::to_string(level_ent.first);
+    for (auto const& stat_ent : level_ent.second) {
+      auto stat_type = stat_ent.first;
+      auto key_str =
+          "compaction." + level_str + "." +
+          InternalStats::compaction_level_stats.at(stat_type).property_name;
+      (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+    }
+  }
+
+  DumpCFMapStatsIOStalls(cf_stats);
+}
+
+void InternalStats::DumpCFMapStats(
+    const VersionStorageInfo* vstorage,
+    std::map<int, std::map<LevelStatType, double>>* levels_stats,
+    CompactionStats* compaction_stats_sum) {
+  assert(vstorage);
+
+  int num_levels_to_check =
+      (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+          ? vstorage->num_levels() - 1
+          : 1;
+
+  // Compaction scores are sorted based on their value. Restore them to the
+  // level order.
+  std::vector<double> compaction_score(number_levels_, 0);
+  for (int i = 0; i < num_levels_to_check; ++i) {
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
+  }
+  // Count # of files being compacted for each level
+  std::vector<int> files_being_compacted(number_levels_, 0);
+  for (int level = 0; level < number_levels_; ++level) {
+    for (auto* f : vstorage->LevelFiles(level)) {
+      if (f->being_compacted) {
+        ++files_being_compacted[level];
+      }
+    }
+  }
+
+  int total_files = 0;
+  int total_files_being_compacted = 0;
+  double total_file_size = 0;
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t curr_ingest = flush_ingest + add_file_ingest;
+  for (int level = 0; level < number_levels_; level++) {
+    int files = vstorage->NumLevelFiles(level);
+    total_files += files;
+    total_files_being_compacted += files_being_compacted[level];
+    if (comp_stats_[level].micros > 0 || comp_stats_[level].cpu_micros > 0 ||
+        files > 0) {
+      compaction_stats_sum->Add(comp_stats_[level]);
+      total_file_size += vstorage->NumLevelBytes(level);
+      uint64_t input_bytes;
+      if (level == 0) {
+        input_bytes = curr_ingest;
+      } else {
+        input_bytes = comp_stats_[level].bytes_read_non_output_levels +
+                      comp_stats_[level].bytes_read_blob;
+      }
+      double w_amp =
+          (input_bytes == 0)
+              ? 0.0
+              : static_cast<double>(comp_stats_[level].bytes_written +
+                                    comp_stats_[level].bytes_written_blob) /
+                    input_bytes;
+      std::map<LevelStatType, double> level_stats;
+      PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+                        static_cast<double>(vstorage->NumLevelBytes(level)),
+                        compaction_score[level], w_amp, comp_stats_[level]);
+      (*levels_stats)[level] = level_stats;
+    }
+  }
+  // Cumulative summary
+  double w_amp = (0 == curr_ingest)
+                     ? 0.0
+                     : (compaction_stats_sum->bytes_written +
+                        compaction_stats_sum->bytes_written_blob) /
+                           static_cast<double>(curr_ingest);
+  // Stats summary across levels
+  std::map<LevelStatType, double> sum_stats;
+  PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+                    total_file_size, 0, w_amp, *compaction_stats_sum);
+  (*levels_stats)[-1] = sum_stats;  // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+    std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+    if (comp_stats_by_pri_[priority].micros > 0) {
+      std::map<LevelStatType, double> priority_stats;
+      PrepareLevelStats(&priority_stats, 0 /* num_files */,
+                        0 /* being_compacted */, 0 /* total_file_size */,
+                        0 /* compaction_score */, 0 /* w_amp */,
+                        comp_stats_by_pri_[priority]);
+      (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+    }
+  }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+    std::map<std::string, std::string>* cf_stats) {
+  (*cf_stats)["io_stalls.level0_slowdown"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_numfiles"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.memtable_compaction"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.memtable_slowdown"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+  uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+                        cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+                        cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+  uint64_t total_slowdown =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+  (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+  (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+  DumpCFFileHistogram(value);
+}
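The flat map built by DumpCFMapStats and DumpCFMapStatsIOStalls above is what the map form of the CF stats property returns. A minimal sketch, assuming an open DB* db on the default column family (key names follow compaction_level_stats and the "io_stalls." literals above):

// Minimal sketch, assuming an open DB* db.
std::map<std::string, std::string> cf_stats;
if (db->GetMapProperty(rocksdb::DB::Properties::kCFStats, &cf_stats)) {
  // e.g. "io_stalls.total_stop", "io_stalls.total_slowdown", plus per-level
  // compaction entries such as "compaction.Sum.<stat>" and
  // "compaction.L0.<stat>".
  const std::string& total_stop = cf_stats["io_stalls.total_stop"];
  (void)total_stop;
}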
+
+void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic,
+                                               std::string* value) {
+  char buf[2000];
+  // Per-ColumnFamily stats
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+  value->append(buf);
+
+  // Print stats for each level
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  CompactionStats compaction_stats_sum;
+  DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+  for (int l = 0; l < number_levels_; ++l) {
+    if (levels_stats.find(l) != levels_stats.end()) {
+      PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(l),
+                      levels_stats[l]);
+      value->append(buf);
+    }
+  }
+
+  // Print sum of level stats
+  PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+  value->append(buf);
+
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+  uint64_t ingest_l0_files_addfile =
+      cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+  uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+  // Cumulative summary
+  uint64_t total_stall_count =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+  // Interval summary
+  uint64_t interval_flush_ingest =
+      flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+  uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+  CompactionStats interval_stats(compaction_stats_sum);
+  interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+  double w_amp =
+      (interval_stats.bytes_written + interval_stats.bytes_written_blob) /
+      static_cast<double>(interval_ingest);
+  PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+  value->append(buf);
+
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+  value->append(buf);
+  std::map<int, std::map<LevelStatType, double>> priorities_stats;
+  DumpCFMapStatsByPriority(&priorities_stats);
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+    if (priorities_stats.find(static_cast<int>(priority)) !=
+        priorities_stats.end()) {
+      PrintLevelStats(
+          buf, sizeof(buf),
+          Env::PriorityToString(static_cast<Env::Priority>(priority)),
+          priorities_stats[static_cast<int>(priority)]);
+      value->append(buf);
+    }
+  }
+
+  const auto blob_st = vstorage->GetBlobStats();
+
+  snprintf(buf, sizeof(buf),
+           "\nBlob file count: %" ROCKSDB_PRIszt
+           ", total size: %.1f GB, garbage size: %.1f GB, space amp: %.1f\n\n",
+           vstorage->GetBlobFiles().size(), blob_st.total_file_size / kGB,
+           blob_st.total_garbage_size / kGB, blob_st.space_amp);
+  value->append(buf);
+
+  uint64_t now_micros = clock_->NowMicros();
+  double seconds_up = (now_micros - started_at_) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+           flush_ingest / kGB, interval_flush_ingest / kGB);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+  value->append(buf);
+
+  uint64_t interval_ingest_files_addfile =
+      ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+           "\n",
+           ingest_files_addfile, interval_ingest_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_l0_files_addfile =
+      ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_keys_addfile =
+      ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_keys_addfile, interval_ingest_keys_addfile);
+  value->append(buf);
+
+  // Compact
+  uint64_t compact_bytes_read = 0;
+  uint64_t compact_bytes_write = 0;
+  uint64_t compact_micros = 0;
+  for (int level = 0; level < number_levels_; level++) {
+    compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+                          comp_stats_[level].bytes_read_non_output_levels +
+                          comp_stats_[level].bytes_read_blob;
+    compact_bytes_write += comp_stats_[level].bytes_written +
+                           comp_stats_[level].bytes_written_blob;
+    compact_micros += comp_stats_[level].micros;
+  }
+
+  snprintf(buf, sizeof(buf),
+           "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+           compact_bytes_write / kGB,
+           compact_bytes_write / kMB / std::max(seconds_up, 0.001),
+           compact_bytes_read / kGB,
+           compact_bytes_read / kMB / std::max(seconds_up, 0.001),
+           compact_micros / kMicrosInSec);
+  value->append(buf);
+
+  // Compaction interval
+  uint64_t interval_compact_bytes_write =
+      compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+  uint64_t interval_compact_bytes_read =
+      compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+  uint64_t interval_compact_micros =
+      compact_micros - cf_stats_snapshot_.compact_micros;
+
+  snprintf(
+      buf, sizeof(buf),
+      "Interval compaction: %.2f GB write, %.2f MB/s write, "
+      "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+      interval_compact_bytes_write / kGB,
+      interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_bytes_read / kGB,
+      interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_micros / kMicrosInSec);
+  value->append(buf);
+  if (is_periodic) {
+    cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+    cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+    cf_stats_snapshot_.compact_micros = compact_micros;
+  }
+
+  snprintf(buf, sizeof(buf),
+           "Stalls(count): %" PRIu64
+           " level0_slowdown, "
+           "%" PRIu64
+           " level0_slowdown_with_compaction, "
+           "%" PRIu64
+           " level0_numfiles, "
+           "%" PRIu64
+           " level0_numfiles_with_compaction, "
+           "%" PRIu64
+           " stop for pending_compaction_bytes, "
+           "%" PRIu64
+           " slowdown for pending_compaction_bytes, "
+           "%" PRIu64
+           " memtable_compaction, "
+           "%" PRIu64
+           " memtable_slowdown, "
+           "interval %" PRIu64 " total count\n",
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+           cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+           cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+           total_stall_count - cf_stats_snapshot_.stall_count);
+  value->append(buf);
+
+  if (is_periodic) {
+    cf_stats_snapshot_.seconds_up = seconds_up;
+    cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+    cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+    cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+    cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+    cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+    cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+    cf_stats_snapshot_.stall_count = total_stall_count;
+  }
+
+  // Do not gather cache entry stats during CFStats because DB
+  // mutex is held. Only dump last cached collection (rely on DB
+  // periodic stats dump to update)
+  if (cache_entry_stats_collector_) {
+    CacheEntryRoleStats stats;
+    // thread safe
+    cache_entry_stats_collector_->GetStats(&stats);
+
+    constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U;
+
+    // Skip if stats are extremely old (> 1 day, incl not yet populated)
+    if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
+      value->append(stats.ToString(clock_));
+    }
+  }
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+  assert(value);
+  assert(cfd_);
+
+  std::ostringstream oss;
+  oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName()
+      << "] **\n";
+
+  for (int level = 0; level < number_levels_; level++) {
+    if (!file_read_latency_[level].Empty()) {
+      oss << "** Level " << level << " read latency histogram (micros):\n"
+          << file_read_latency_[level].ToString() << '\n';
+    }
+  }
+
+  if (!blob_file_read_latency_.Empty()) {
+    oss << "** Blob file read latency histogram (micros):\n"
+        << blob_file_read_latency_.ToString() << '\n';
+  }
+
+  value->append(oss.str());
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+  return nullptr;
+}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
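For reference, the human-readable dumps assembled in internal_stats.cc surface through the string properties. A minimal sketch, assuming an open DB* db:

// Minimal sketch, assuming an open DB* db.
std::string text;
db->GetProperty(rocksdb::DB::Properties::kStats, &text);    // CF + DB stats
db->GetProperty(rocksdb::DB::Properties::kDBStats, &text);  // DB stats only
db->GetProperty(rocksdb::DB::Properties::kCFFileHistogram, &text);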
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..b0cd5899b
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,996 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "db/version_set.h"
+#include "rocksdb/system_clock.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Stats>
+class CacheEntryStatsCollector;
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+  bool need_out_of_mutex;
+
+  // gcc had an internal error for initializing union of pointer-to-member-
+  // functions. Workaround is to populate exactly one of the following
+  // function pointers with a non-nullptr value.
+
+  // @param value Value-result argument for storing the property's string value
+  // @param suffix Argument portion of the property. For example, suffix would
+  //   be "5" for the property "rocksdb.num-files-at-level5". So far, only
+  //   certain string properties take an argument.
+  bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+  // @param value Value-result argument for storing the property's uint64 value
+  // @param db Many of the int properties rely on DBImpl methods.
+  // @param version Version is needed in case the property is retrieved without
+  //   holding db mutex, which is only supported for int properties.
+  bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+                                    Version* version);
+
+  // @param props Map of general properties to populate
+  // @param suffix Argument portion of the property. (see handle_string)
+  bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props,
+                                    Slice suffix);
+
+  // Handles string-type properties that rely on DBImpl methods.
+  // @param value Value-result argument for storing the property's string value
+  bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+  INVALID = 0,
+  NUM_FILES,
+  COMPACTED_FILES,
+  SIZE_BYTES,
+  SCORE,
+  READ_GB,
+  RN_GB,
+  RNP1_GB,
+  WRITE_GB,
+  W_NEW_GB,
+  MOVED_GB,
+  WRITE_AMP,
+  READ_MBPS,
+  WRITE_MBPS,
+  COMP_SEC,
+  COMP_CPU_SEC,
+  COMP_COUNT,
+  AVG_SEC,
+  KEY_IN,
+  KEY_DROP,
+  R_BLOB_GB,
+  W_BLOB_GB,
+  TOTAL  // total number of types
+};
+
+struct LevelStat {
+  // This is what will be L?.property_name in the flat map returned to the user
+  std::string property_name;
+  // This is what we will print in the header in the CLI
+  std::string header_name;
+};
+
+struct DBStatInfo {
+  // This is what will be property_name in the flat map returned to the user
+  std::string property_name;
+};
+
+class InternalStats {
+ public:
+  static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+  enum InternalCFStatsType {
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    kIntStatsWriteStallMicros,
+    kIntStatsNumMax,
+  };
+
+  static const std::map<InternalDBStatsType, DBStatInfo> db_stats_type_to_info;
+
+  InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
+
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
+  // Per level compaction stats. comp_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+
+    // The number of bytes read from all non-output levels (table files)
+    uint64_t bytes_read_non_output_levels;
+
+    // The number of bytes read from the compaction output level (table files)
+    uint64_t bytes_read_output_level;
+
+    // The number of bytes read from blob files
+    uint64_t bytes_read_blob;
+
+    // Total number of bytes written to table files during compaction
+    uint64_t bytes_written;
+
+    // Total number of bytes written to blob files during compaction
+    uint64_t bytes_written_blob;
+
+    // Total number of bytes moved to the output level (table files)
+    uint64_t bytes_moved;
+
+    // The number of compaction input files in all non-output levels (table
+    // files)
+    int num_input_files_in_non_output_levels;
+
+    // The number of compaction input files in the output level (table files)
+    int num_input_files_in_output_level;
+
+    // The number of compaction output files (table files)
+    int num_output_files;
+
+    // The number of compaction output files (blob files)
+    int num_output_files_blob;
+
+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entries) for compaction levels N and N+1
+    uint64_t num_dropped_records;
+
+    // Total output entries from compaction
+    uint64_t num_output_records;
+
+    // Number of compactions done
+    int count;
+
+    // Number of compactions done per CompactionReason
+    int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
+
+    explicit CompactionStats()
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(0) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    explicit CompactionStats(CompactionReason reason, int c)
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(c) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      if (r >= 0 && r < num_of_reasons) {
+        counts[r] = c;
+      } else {
+        count = 0;
+      }
+    }
+
+    CompactionStats(const CompactionStats& c)
+        : micros(c.micros),
+          cpu_micros(c.cpu_micros),
+          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+          bytes_read_output_level(c.bytes_read_output_level),
+          bytes_read_blob(c.bytes_read_blob),
+          bytes_written(c.bytes_written),
+          bytes_written_blob(c.bytes_written_blob),
+          bytes_moved(c.bytes_moved),
+          num_input_files_in_non_output_levels(
+              c.num_input_files_in_non_output_levels),
+          num_input_files_in_output_level(c.num_input_files_in_output_level),
+          num_output_files(c.num_output_files),
+          num_output_files_blob(c.num_output_files_blob),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
+          num_output_records(c.num_output_records),
+          count(c.count) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+    }
+
+    CompactionStats& operator=(const CompactionStats& c) {
+      micros = c.micros;
+      cpu_micros = c.cpu_micros;
+      bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+      bytes_read_output_level = c.bytes_read_output_level;
+      bytes_read_blob = c.bytes_read_blob;
+      bytes_written = c.bytes_written;
+      bytes_written_blob = c.bytes_written_blob;
+      bytes_moved = c.bytes_moved;
+      num_input_files_in_non_output_levels =
+          c.num_input_files_in_non_output_levels;
+      num_input_files_in_output_level = c.num_input_files_in_output_level;
+      num_output_files = c.num_output_files;
+      num_output_files_blob = c.num_output_files_blob;
+      num_input_records = c.num_input_records;
+      num_dropped_records = c.num_dropped_records;
+      num_output_records = c.num_output_records;
+      count = c.count;
+
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+      return *this;
+    }
+
+    void Clear() {
+      this->micros = 0;
+      this->cpu_micros = 0;
+      this->bytes_read_non_output_levels = 0;
+      this->bytes_read_output_level = 0;
+      this->bytes_read_blob = 0;
+      this->bytes_written = 0;
+      this->bytes_written_blob = 0;
+      this->bytes_moved = 0;
+      this->num_input_files_in_non_output_levels = 0;
+      this->num_input_files_in_output_level = 0;
+      this->num_output_files = 0;
+      this->num_output_files_blob = 0;
+      this->num_input_records = 0;
+      this->num_dropped_records = 0;
+      this->num_output_records = 0;
+      this->count = 0;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->cpu_micros += c.cpu_micros;
+      this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+      this->bytes_read_output_level += c.bytes_read_output_level;
+      this->bytes_read_blob += c.bytes_read_blob;
+      this->bytes_written += c.bytes_written;
+      this->bytes_written_blob += c.bytes_written_blob;
+      this->bytes_moved += c.bytes_moved;
+      this->num_input_files_in_non_output_levels +=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level +=
+          c.num_input_files_in_output_level;
+      this->num_output_files += c.num_output_files;
+      this->num_output_files_blob += c.num_output_files_blob;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
+      this->num_output_records += c.num_output_records;
+      this->count += c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += c.counts[i];
+      }
+    }
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_files += static_cast<int>(stats.num_output_files);
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files_blob +=
+          static_cast<int>(stats.num_output_files_blob);
+    }
+
+    void Subtract(const CompactionStats& c) {
+      this->micros -= c.micros;
+      this->cpu_micros -= c.cpu_micros;
+      this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+      this->bytes_read_output_level -= c.bytes_read_output_level;
+      this->bytes_read_blob -= c.bytes_read_blob;
+      this->bytes_written -= c.bytes_written;
+      this->bytes_written_blob -= c.bytes_written_blob;
+      this->bytes_moved -= c.bytes_moved;
+      this->num_input_files_in_non_output_levels -=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level -=
+          c.num_input_files_in_output_level;
+      this->num_output_files -= c.num_output_files;
+      this->num_output_files_blob -= c.num_output_files_blob;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
+      this->num_output_records -= c.num_output_records;
+      this->count -= c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] -= c.counts[i];
+      }
+    }
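Add and Subtract exist mainly to support interval reporting: DumpCFStatsNoFileHistogram copies the cumulative stats, subtracts the snapshot kept from the previous periodic dump, and prints the difference as the "Int" row. A sketch of that pattern with hypothetical local names:

// Sketch with hypothetical names; mirrors DumpCFStatsNoFileHistogram.
InternalStats::CompactionStats interval(cumulative_stats);  // copy cumulative
interval.Subtract(snapshot_from_last_dump);  // keep only this period's delta
// ... print `interval` ...
snapshot_from_last_dump = cumulative_stats;  // roll the snapshot forward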
+
+    void ResetCompactionReason(CompactionReason reason) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      assert(count == 1);  // only supports updating one compaction reason
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      assert(r >= 0 && r < num_of_reasons);
+      counts[r] = 1;
+    }
+  };
+
+  // Compaction stats for per_key_placement compaction; it includes stats for
+  // two levels: the last level and the penultimate level.
+  struct CompactionStatsFull {
+    // the stats for the target primary output level
+    CompactionStats stats;
+
+    // stats for penultimate level output if exist
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+    explicit CompactionStatsFull(CompactionReason reason, int c)
+        : stats(reason, c), penultimate_level_stats(reason, c) {}
+
+    uint64_t TotalBytesWritten() const {
+      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+      if (has_penultimate_level_output) {
+        bytes_written += penultimate_level_stats.bytes_written +
+                         penultimate_level_stats.bytes_written_blob;
+      }
+      return bytes_written;
+    }
+
+    uint64_t DroppedRecords() {
+      uint64_t output_records = stats.num_output_records;
+      if (has_penultimate_level_output) {
+        output_records += penultimate_level_stats.num_output_records;
+      }
+      if (stats.num_input_records > output_records) {
+        return stats.num_input_records - output_records;
+      }
+      return 0;
+    }
+
+    void SetMicros(uint64_t val) {
+      stats.micros = val;
+      penultimate_level_stats.micros = val;
+    }
+
+    void AddCpuMicros(uint64_t val) {
+      stats.cpu_micros += val;
+      penultimate_level_stats.cpu_micros += val;
+    }
+  };
+
+  // For use with CacheEntryStatsCollector
+  struct CacheEntryRoleStats {
+    uint64_t cache_capacity = 0;
+    uint64_t cache_usage = 0;
+    size_t table_size = 0;
+    size_t occupancy = 0;
+    std::string cache_id;
+    std::array<uint64_t, kNumCacheEntryRoles> total_charges;
+    std::array<size_t, kNumCacheEntryRoles> entry_counts;
+    uint32_t collection_count = 0;
+    uint32_t copies_of_last_collection = 0;
+    uint64_t last_start_time_micros_ = 0;
+    uint64_t last_end_time_micros_ = 0;
+
+    void Clear() {
+      // Wipe everything except collection_count
+      uint32_t saved_collection_count = collection_count;
+      *this = CacheEntryRoleStats();
+      collection_count = saved_collection_count;
+    }
+
+    void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+    std::function<void(const Slice& key, void* value, size_t charge,
+                       Cache::DeleterFn deleter)>
+    GetEntryCallback();
+    void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+    void SkippedCollection();
+
+    std::string ToString(SystemClock* clock) const;
+    void ToMap(std::map<std::string, std::string>* values,
+               SystemClock* clock) const;
+
+   private:
+    UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+    uint64_t GetLastDurationMicros() const;
+  };
+
+  void Clear() {
+    for (int i = 0; i < kIntStatsNumMax; i++) {
+      db_stats_[i].store(0);
+    }
+    for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+      cf_stats_count_[i] = 0;
+      cf_stats_value_[i] = 0;
+    }
+    for (auto& comp_stat : comp_stats_) {
+      comp_stat.Clear();
+    }
+    per_key_placement_comp_stats_.Clear();
+    for (auto& h : file_read_latency_) {
+      h.Clear();
+    }
+    blob_file_read_latency_.Clear();
+    cf_stats_snapshot_.Clear();
+    db_stats_snapshot_.Clear();
+    bg_error_count_ = 0;
+    started_at_ = clock_->NowMicros();
+    has_cf_change_since_dump_ = true;
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStats& stats) {
+    comp_stats_[level].Add(stats);
+    comp_stats_by_pri_[thread_pri].Add(stats);
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStatsFull& comp_stats_full) {
+    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+    if (comp_stats_full.has_penultimate_level_output) {
+      per_key_placement_comp_stats_.Add(
+          comp_stats_full.penultimate_level_stats);
+    }
+  }
+
+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
+  }
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {
+    has_cf_change_since_dump_ = true;
+    cf_stats_value_[type] += value;
+    ++cf_stats_count_[type];
+  }
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value,
+                  bool concurrent = false) {
+    auto& v = db_stats_[type];
+    if (concurrent) {
+      v.fetch_add(value, std::memory_order_relaxed);
+    } else {
+      v.store(v.load(std::memory_order_relaxed) + value,
+              std::memory_order_relaxed);
+    }
+  }
+
+  uint64_t GetDBStats(InternalDBStatsType type) {
+    return db_stats_[type].load(std::memory_order_relaxed);
+  }
+
+  HistogramImpl* GetFileReadHist(int level) {
+    return &file_read_latency_[level];
+  }
+
+  HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetStringProperty(const DBPropertyInfo& property_info,
+                         const Slice& property, std::string* value);
+
+  bool GetMapProperty(const DBPropertyInfo& property_info,
+                      const Slice& property,
+                      std::map<std::string, std::string>* value);
+
+  bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+                      DBImpl* db);
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+                                Version* version, uint64_t* value);
+
+  // Unless there is a recent enough collection of the stats, collect and
+  // save new cache entry stats. If `foreground`, require data to be more
+  // recent to skip re-collection.
+  //
+  // This should only be called while NOT holding the DB mutex.
+  void CollectCacheEntryStats(bool foreground);
+
+  const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; }
+
+  const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+    return comp_stats_;
+  }
+
+  const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+    return per_key_placement_comp_stats_;
+  }
+
+  void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
+
+  // Store a mapping from the user-facing DB::Properties string to our
+  // DBPropertyInfo struct used internally for retrieving properties.
+  static const UnorderedMap<std::string, DBPropertyInfo> ppt_name_to_info;
+
+  static const std::string kPeriodicCFStats;
+
+ private:
+  void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
+  void DumpDBStats(std::string* value);
+  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+  void DumpCFMapStats(
+      const VersionStorageInfo* vstorage,
+      std::map<int, std::map<LevelStatType, double>>* level_stats,
+      CompactionStats* compaction_stats_sum);
+  void DumpCFMapStatsByPriority(
+      std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+  void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+  void DumpCFStats(std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFFileHistogram(std::string* value);
+
+  Cache* GetBlockCacheForStats();
+  Cache* GetBlobCacheForStats();
+
+  // Per-DB stats
+  std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+  // Per-ColumnFamily stats
+  uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+  uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+  // Initialize/reference the collector in constructor so that we don't need
+  // additional synchronization in InternalStats, relying on synchronization
+  // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache
+  // (through a shared_ptr) so that it does not get immediately ejected from
+  // a full cache, which would force a re-scan on the next GetStats.
+  std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+      cache_entry_stats_collector_;
+  // Per-ColumnFamily/level compaction stats
+  std::vector<CompactionStats> comp_stats_;
+  std::vector<CompactionStats> comp_stats_by_pri_;
+  CompactionStats per_key_placement_comp_stats_;
+  std::vector<HistogramImpl> file_read_latency_;
+  HistogramImpl blob_file_read_latency_;
+  bool has_cf_change_since_dump_;
+  // How many periods of no change since the last time stats are dumped for
+  // a periodic dump.
+  int no_cf_change_period_since_dump_ = 0;
+  uint64_t last_histogram_num = std::numeric_limits<uint64_t>::max();
+  static const int kMaxNoChangePeriodSinceDump;
+
+  // Used to compute per-interval statistics
+  struct CFStatsSnapshot {
+    // ColumnFamily-level stats
+    CompactionStats comp_stats;
+    uint64_t ingest_bytes_flush;  // Bytes written to L0 (Flush)
+    uint64_t stall_count;         // Stall count
+    // Stats from compaction jobs - bytes written, bytes read, duration.
+    uint64_t compact_bytes_write;
+    uint64_t compact_bytes_read;
+    uint64_t compact_micros;
+    double seconds_up;
+
+    // AddFile specific stats
+    uint64_t ingest_bytes_addfile;     // Total Bytes ingested
+    uint64_t ingest_files_addfile;     // Total number of files ingested
+    uint64_t ingest_l0_files_addfile;  // Total number of files ingested to L0
+    uint64_t ingest_keys_addfile;      // Total number of keys ingested
+
+    CFStatsSnapshot()
+        : ingest_bytes_flush(0),
+          stall_count(0),
+          compact_bytes_write(0),
+          compact_bytes_read(0),
+          compact_micros(0),
+          seconds_up(0),
+          ingest_bytes_addfile(0),
+          ingest_files_addfile(0),
+          ingest_l0_files_addfile(0),
+          ingest_keys_addfile(0) {}
+
+    void Clear() {
+      comp_stats.Clear();
+      ingest_bytes_flush = 0;
+      stall_count = 0;
+      compact_bytes_write = 0;
+      compact_bytes_read = 0;
+      compact_micros = 0;
+      seconds_up = 0;
+      ingest_bytes_addfile = 0;
+      ingest_files_addfile = 0;
+      ingest_l0_files_addfile = 0;
+      ingest_keys_addfile = 0;
+    }
+  } cf_stats_snapshot_;
+
+  struct DBStatsSnapshot {
+    // DB-level stats
+    uint64_t ingest_bytes;    // Bytes written by user
+    uint64_t wal_bytes;       // Bytes written to WAL
+    uint64_t wal_synced;      // Number of times WAL is synced
+    uint64_t write_with_wal;  // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure the
+    // number of write requests written; each write request can contain
+    // updates to multiple keys. num_keys_written is the total number of keys
+    // updated by all those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
+    double seconds_up;
+
+    DBStatsSnapshot()
+        : ingest_bytes(0),
+          wal_bytes(0),
+          wal_synced(0),
+          write_with_wal(0),
+          write_other(0),
+          write_self(0),
+          num_keys_written(0),
+          write_stall_micros(0),
+          seconds_up(0) {}
+
+    void Clear() {
+      ingest_bytes = 0;
+      wal_bytes = 0;
+      wal_synced = 0;
+      write_with_wal = 0;
+      write_other = 0;
+      write_self = 0;
+      num_keys_written = 0;
+      write_stall_micros = 0;
+      seconds_up = 0;
+    }
+  } db_stats_snapshot_;
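// The two snapshot structs above exist so that a periodic stats dump can
// report per-interval figures: each dump subtracts the snapshot taken at the
// previous dump from the current cumulative counters, then rolls the
// snapshot forward. A minimal sketch of that subtract-and-roll pattern
// (IntervalSnapshot is a hypothetical type, not part of this patch; assumes
// <cstdint>):
struct IntervalSnapshot {
  uint64_t last_ingest_bytes = 0;  // value captured at the previous dump

  // Returns bytes ingested since the previous call and rolls the snapshot.
  uint64_t IntervalAndRoll(uint64_t cumulative_ingest_bytes) {
    uint64_t interval = cumulative_ingest_bytes - last_ingest_bytes;
    last_ingest_bytes = cumulative_ingest_bytes;
    return interval;
  }
};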
+
+  // Handler functions for getting property values. They use "value" as a
+  // value-result argument, and return true upon successfully setting "value".
+  bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+  bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+  bool HandleLevelStats(std::string* value, Slice suffix);
+  bool HandleStats(std::string* value, Slice suffix);
+  bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleCFStats(std::string* value, Slice suffix);
+  bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
+  bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleDBStats(std::string* value, Slice suffix);
+  bool HandleSsTables(std::string* value, Slice suffix);
+  bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevel(std::string* value,
+                                              Slice suffix);
+  bool HandleAggregatedTablePropertiesMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevelMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+                                         Version* version);
+  bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db,
+                                 Version* version);
+  bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+                                       Version* version);
+  bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+                                            Version* version);
+  bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+                                        Version* version);
+  bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheEntryStatsInternal(std::string* value, bool fast);
+  bool HandleBlockCacheEntryStatsMapInternal(
+      std::map<std::string, std::string>* values, bool fast);
+  bool HandleBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleBlockCacheEntryStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleFastBlockCacheEntryStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
+  bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobStats(std::string* value, Slice suffix);
+  bool HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileGarbageSize(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleBlobCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                  Version* version);
+
+  // Total number of background errors encountered. Every time a flush task
+  // or compaction task fails, this counter is incremented. The failure can
+  // be caused by any possible reason, including file system errors, out of
+  // resources, or input file corruption. Failing when retrying the same flush
+  // or compaction will cause the counter to increase too.
+  uint64_t bg_error_count_;
+
+  const int number_levels_;
+  SystemClock* clock_;
+  ColumnFamilyData* cfd_;
+  uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+  enum InternalCFStatsType {
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    kIntStatsWriteStallMicros,
+    kIntStatsNumMax,
+  };
+
+  InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
+                ColumnFamilyData* /*cfd*/) {}
+
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+    uint64_t bytes_read_non_output_levels;
+    uint64_t bytes_read_output_level;
+    uint64_t bytes_read_blob;
+    uint64_t bytes_written;
+    uint64_t bytes_written_blob;
+    uint64_t bytes_moved;
+    int num_input_files_in_non_output_levels;
+    int num_input_files_in_output_level;
+    int num_output_files;
+    int num_output_files_blob;
+    uint64_t num_input_records;
+    uint64_t num_dropped_records;
+    uint64_t num_output_records;
+    int count;
+
+    explicit CompactionStats() {}
+
+    explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+    explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+    void Add(const CompactionStats& /*c*/) {}
+
+    void Add(const CompactionOutputsStats& /*c*/) {}
+
+    void Subtract(const CompactionStats& /*c*/) {}
+  };
+
+  struct CompactionStatsFull {
+    // the stats for the target primary output level (per level stats)
+    CompactionStats stats;
+
+    // stats for output_to_penultimate_level level (per level stats)
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull(){};
+
+    explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
+
+    uint64_t TotalBytesWritten() const { return 0; }
+
+    uint64_t DroppedRecords() { return 0; }
+
+    void SetMicros(uint64_t /*val*/){};
+
+    void AddCpuMicros(uint64_t /*val*/){};
+  };
+
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStats& /*stats*/) {}
+
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStatsFull& /*unmerged_stats*/) {}
+
+  void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+  void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+  void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+                  bool /*concurrent */ = false) {}
+
+  HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+  HistogramImpl* GetBlobFileReadHist() { return nullptr; }
+
+  uint64_t GetBackgroundErrorCount() const { return 0; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+  bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+                         const Slice& /*property*/, std::string* /*value*/) {
+    return false;
+  }
+
+  bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) {
+    return false;
+  }
+
+  bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+                      uint64_t* /*value*/, DBImpl* /*db*/) const {
+    return false;
+  }
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+                                Version* /*version*/,
+                                uint64_t* /*value*/) const {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..352c58e82
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+  struct WriteStallNotification {
+    WriteStallInfo write_stall_info;
+    const ImmutableOptions* immutable_options;
+  };
+
+  autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+  autovector<WriteStallNotification> write_stall_notifications;
+#endif
+  std::unique_ptr<SuperVersion>
+      new_superversion;  // if nullptr no new superversion
+
+  explicit SuperVersionContext(bool create_superversion = false)
+      : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+  explicit SuperVersionContext(SuperVersionContext&& other) noexcept
+      : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+        write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+        new_superversion(std::move(other.new_superversion)) {
+  }
+  // No copies
+  SuperVersionContext(const SuperVersionContext& other) = delete;
+  void operator=(const SuperVersionContext& other) = delete;
+
+  void NewSuperVersion() {
+    new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+  }
+
+  inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    return !superversions_to_free.empty() ||
+           !write_stall_notifications.empty();
+#else
+    return !superversions_to_free.empty();
+#endif
+  }
+
+  void PushWriteStallNotification(WriteStallCondition old_cond,
+                                  WriteStallCondition new_cond,
+                                  const std::string& name,
+                                  const ImmutableOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    WriteStallNotification notif;
+    notif.write_stall_info.cf_name = name;
+    notif.write_stall_info.condition.prev = old_cond;
+    notif.write_stall_info.condition.cur = new_cond;
+    notif.immutable_options = ioptions;
+    write_stall_notifications.push_back(notif);
+#else
+    (void)old_cond;
+    (void)new_cond;
+    (void)name;
+    (void)ioptions;
+#endif  // !defined(ROCKSDB_LITE) &&
+        // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+  }
+
+  void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    // notify listeners on changed write stall conditions
+    for (auto& notif : write_stall_notifications) {
+      for (auto& listener : notif.immutable_options->listeners) {
+        listener->OnStallConditionsChanged(notif.write_stall_info);
+      }
+    }
+    write_stall_notifications.clear();
+#endif  // !ROCKSDB_LITE
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    superversions_to_free.clear();
+  }
+
+  ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    assert(write_stall_notifications.empty());
+#endif
+    assert(superversions_to_free.empty());
+  }
+};
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return !(full_scan_candidate_files.empty() && sst_delete_files.empty() &&
+             blob_delete_files.empty() && log_delete_files.empty() &&
+             manifest_delete_files.empty());
+  }
+
+  inline bool HaveSomethingToClean() const {
+    bool sv_have_sth = false;
+    for (const auto& sv_ctx : superversion_contexts) {
+      if (sv_ctx.HaveSomethingToDelete()) {
+        sv_have_sth = true;
+        break;
+      }
+    }
+    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+           job_snapshot != nullptr || sv_have_sth;
+  }
+
+  SequenceNumber GetJobSnapshotSequence() const {
+    if (job_snapshot) {
+      assert(job_snapshot->snapshot());
+      return job_snapshot->snapshot()->GetSequenceNumber();
+    }
+    return kMaxSequenceNumber;
+  }
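// JobContext exists so that slow cleanup (deleting files, memtables, and
// superversions) can happen outside the DB mutex: background work fills the
// context while the mutex is held, and Clean() runs only after the mutex is
// released. A minimal sketch of that discipline, kept in comment form here
// (FindObsoleteFiles/PurgeObsoleteFiles argument lists are abbreviated and
// hypothetical, for illustration only):
//
//   JobContext job_context(/*_job_id=*/1);
//   {
//     InstrumentedMutexLock l(&mutex_);       // DB mutex held
//     FindObsoleteFiles(&job_context, ...);   // cheap bookkeeping only
//   }                                         // mutex released here
//   if (job_context.HaveSomethingToDelete()) {
//     PurgeObsoleteFiles(job_context);        // slow I/O, no mutex held
//   }
//   job_context.Clean();                      // must run before destruction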
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    std::string file_path;
+    CandidateFileInfo(std::string name, std::string path)
+        : file_name(std::move(name)), file_path(std::move(path)) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name && file_path == other.file_path;
+    }
+  };
+
+  // Unique job id
+  int job_id;
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<uint64_t> sst_live;
+
+  // the list of sst files that we need to delete
+  std::vector<ObsoleteFileInfo> sst_delete_files;
+
+  // the list of all live blob files that cannot be deleted
+  std::vector<uint64_t> blob_live;
+
+  // the list of blob files that we need to delete
+  std::vector<ObsoleteBlobFileInfo> blob_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of log files that we need to preserve during full purge since they
+  // will be reused later
+  std::vector<uint64_t> log_recycle_files;
+
+  // a list of manifest files that we need to delete
+  std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be freed
+  autovector<MemTable*> memtables_to_free;
+
+  // contexts for installing superversions for multiple column families
+  std::vector<SuperVersionContext> superversion_contexts;
+
+  autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that correspond to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+  uint64_t prev_total_log_size = 0;
+  size_t num_alive_log_files = 0;
+  uint64_t size_log_to_delete = 0;
+
+  // Snapshot taken before flush/compaction job.
+  std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+  explicit JobContext(int _job_id, bool create_superversion = false) {
+    job_id = _job_id;
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    superversion_contexts.emplace_back(
+        SuperVersionContext(create_superversion));
+  }
+
+  // For non-empty JobContext Clean() has to be called at least once before
+  // destruction (see asserts in ~JobContext()). Should be called with
+  // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
+  // doing potentially slow Clean() with locked DB mutex.
+  void Clean() {
+    // free superversions
+    for (auto& sv_context : superversion_contexts) {
+      sv_context.Clean();
+    }
+    // free pending memtables
+    for (auto m : memtables_to_free) {
+      delete m;
+    }
+    for (auto l : logs_to_free) {
+      delete l;
+    }
+
+    memtables_to_free.clear();
+    logs_to_free.clear();
+    job_snapshot.reset();
+  }
+
+  ~JobContext() {
+    assert(memtables_to_free.size() == 0);
+    assert(logs_to_free.size() == 0);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/kv_checksum.h b/src/rocksdb/db/kv_checksum.h
new file mode 100644
index 000000000..bce507fcf
--- /dev/null
+++ b/src/rocksdb/db/kv_checksum.h
@@ -0,0 +1,398 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains classes containing fields to protect individual entries.
+// The classes are named "ProtectionInfo<suffix>", where <suffix> indicates
+// the combination of fields that are covered. Each field has a single letter
+// abbreviation as follows.
+//
+// K = key
+// V = value
+// O = optype aka value type
+// S = seqno
+// C = CF ID
+//
+// Then, for example, a class that protects an entry consisting of key, value,
+// optype, and CF ID (i.e., a `WriteBatch` entry) would be named
+// `ProtectionInfoKVOC`.
+//
+// The `ProtectionInfo.*` classes are templated on the integer type used to
+// hold the XOR of hashes for each field. Only unsigned integer types are
+// supported, and the maximum supported integer width is 64 bits. When the
+// integer type is narrower than the hash values, we lop off the most
+// significant bits to make them fit.
+//
+// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do
+// not currently make the byte order consistent for integer fields before
+// hashing them, so the resulting values are endianness-dependent.
+
+#pragma once
+
+#include <type_traits>
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename T>
+class ProtectionInfo;
+template <typename T>
+class ProtectionInfoKVO;
+template <typename T>
+class ProtectionInfoKVOC;
+template <typename T>
+class ProtectionInfoKVOS;
+
+// Aliases for 64-bit protection infos.
+using ProtectionInfo64 = ProtectionInfo<uint64_t>;
+using ProtectionInfoKVO64 = ProtectionInfoKVO<uint64_t>;
+using ProtectionInfoKVOC64 = ProtectionInfoKVOC<uint64_t>;
+using ProtectionInfoKVOS64 = ProtectionInfoKVOS<uint64_t>;
+
+template <typename T>
+class ProtectionInfo {
+ public:
+  ProtectionInfo() = default;
+
+  Status GetStatus() const;
+  ProtectionInfoKVO<T> ProtectKVO(const Slice& key, const Slice& value,
+                                  ValueType op_type) const;
+  ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
+                                  const SliceParts& value,
+                                  ValueType op_type) const;
+
+  T GetVal() const { return val_; }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+
+  // Each field is hashed with an independent value so we can catch fields
+  // being swapped. Per the `NPHash64()` docs, using consecutive seeds is a
+  // pitfall, and we should instead vary our seeds by a large odd number.
+  // This value by which we increment (0xD28AAD72F49BD50B) was taken from
+  // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an
+  // odd number. The values are computed manually since the Windows C++
+  // compiler complains about the overflow when adding constants.
+  static const uint64_t kSeedK = 0;
+  static const uint64_t kSeedV = 0xD28AAD72F49BD50B;
+  static const uint64_t kSeedO = 0xA5155AE5E937AA16;
+  static const uint64_t kSeedS = 0x77A00858DDD37F21;
+  static const uint64_t kSeedC = 0x4A2AB5CBD26F542C;
+
+  ProtectionInfo(T val) : val_(val) {
+    static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { val_ = val; }
+
+  T val_ = 0;
+};
+
+template <typename T>
+class ProtectionInfoKVO {
+ public:
+  ProtectionInfoKVO() = default;
+
+  ProtectionInfo<T> StripKVO(const Slice& key, const Slice& value,
+                             ValueType op_type) const;
+  ProtectionInfo<T> StripKVO(const SliceParts& key, const SliceParts& value,
+                             ValueType op_type) const;
+
+  ProtectionInfoKVOC<T> ProtectC(ColumnFamilyId column_family_id) const;
+  ProtectionInfoKVOS<T> ProtectS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key);
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key);
+  void UpdateV(const Slice& old_value, const Slice& new_value);
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
+  void UpdateO(ValueType old_op_type, ValueType new_op_type);
+
+  T GetVal() const { return info_.GetVal(); }
+
+ private:
+  friend class ProtectionInfo<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+
+  explicit ProtectionInfoKVO(T val) : info_(val) {
+    static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { info_.SetVal(val); }
+
+  ProtectionInfo<T> info_;
+};
+
+template <typename T>
+class ProtectionInfoKVOC {
+ public:
+  ProtectionInfoKVOC() = default;
+
+  ProtectionInfoKVO<T> StripC(ColumnFamilyId column_family_id) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateC(ColumnFamilyId old_column_family_id,
+               ColumnFamilyId new_column_family_id);
+
+  T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOC(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKVOS {
+ public:
+  ProtectionInfoKVOS() = default;
+
+  ProtectionInfoKVO<T> StripS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateS(SequenceNumber old_sequence_number,
+               SequenceNumber new_sequence_number);
+
+  T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOS(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+Status ProtectionInfo<T>::GetStatus() const {
+  if (val_ != 0) {
+    return Status::Corruption("ProtectionInfo mismatch");
+  }
+  return Status::OK();
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const Slice& key,
+                                                   const Slice& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
+                                                   const SliceParts& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key,
+                                   const Slice& new_key) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const SliceParts& old_key,
+                                   const SliceParts& new_key) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const Slice& old_value,
+                                   const Slice& new_value) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const SliceParts& old_value,
+                                   const SliceParts& new_value) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateO(ValueType old_op_type,
+                                   ValueType new_op_type) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&old_op_type),
+                                      sizeof(old_op_type),
+                                      ProtectionInfo<T>::kSeedO));
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&new_op_type),
+                                      sizeof(new_op_type),
+                                      ProtectionInfo<T>::kSeedO));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const Slice& key,
+                                                 const Slice& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
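// A short usage sketch of the classes above (illustrative only; the
// ProtectAndVerify helper is hypothetical, not part of this patch). A writer
// folds key, value, and optype into one 64-bit XOR of hashes; a reader strips
// the same fields back out. Any mismatch leaves a nonzero residue, which
// GetStatus() reports as corruption.
inline Status ProtectAndVerify(const Slice& key, const Slice& value,
                               ValueType op_type) {
  // Fold the fields in...
  ProtectionInfoKVO64 kvo = ProtectionInfo64().ProtectKVO(key, value, op_type);
  // ...and strip the same fields back out. If key/value/op_type were
  // unchanged in between, the XOR residue is zero and the status is OK.
  return kvo.StripKVO(key, value, op_type).GetStatus();
}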
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const SliceParts& key,
+                                                 const SliceParts& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVOC<T> ProtectionInfoKVO<T>::ProtectC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&column_family_id),
+                           sizeof(column_family_id),
+                           ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVOC<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOC<T>::StripC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&column_family_id),
+                           sizeof(column_family_id),
+                           ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOC<T>::UpdateC(ColumnFamilyId old_column_family_id,
+                                    ColumnFamilyId new_column_family_id) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&old_column_family_id),
+                           sizeof(old_column_family_id),
+                           ProtectionInfo<T>::kSeedC));
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&new_column_family_id),
+                           sizeof(new_column_family_id),
+                           ProtectionInfo<T>::kSeedC));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfoKVOS<T> ProtectionInfoKVO<T>::ProtectS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&sequence_number),
+                           sizeof(sequence_number),
+                           ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVOS<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOS<T>::StripS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&sequence_number),
+                           sizeof(sequence_number),
+                           ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
+                                    SequenceNumber new_sequence_number) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&old_sequence_number),
+                           sizeof(old_sequence_number),
+                           ProtectionInfo<T>::kSeedS));
+  val = val ^ static_cast<T>(
+                  NPHash64(reinterpret_cast<char*>(&new_sequence_number),
+                           sizeof(new_sequence_number),
+                           ProtectionInfo<T>::kSeedS));
+  SetVal(val);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..160866bb7
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1595 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "db/blob/blob_index.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +class EventListenerTest : public DBTestBase { + public: + EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {} + + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + const size_t k110KB = 110 << 10; +}; + +struct TestPropertiesCollector + : public ROCKSDB_NAMESPACE::TablePropertiesCollector { + ROCKSDB_NAMESPACE::Status AddUserKey( + const ROCKSDB_NAMESPACE::Slice& /*key*/, + const ROCKSDB_NAMESPACE::Slice& /*value*/, + ROCKSDB_NAMESPACE::EntryType /*type*/, + ROCKSDB_NAMESPACE::SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + ROCKSDB_NAMESPACE::Status Finish( + ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override { + properties->insert({"0", "1"}); + return Status::OK(); + } + + const char* Name() const override { return "TestTablePropertiesCollector"; } + + ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties() + const override { + ROCKSDB_NAMESPACE::UserCollectedProperties ret; + ret["2"] = "3"; + return ret; + } +}; + +class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TestPropertiesCollector; + } + const char* Name() const override { return "TestTablePropertiesCollector"; } +}; + +class TestCompactionListener : public EventListener { + public: + explicit TestCompactionListener(EventListenerTest* test) : test_(test) {} + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + std::lock_guard lock(mutex_); + compacted_dbs_.push_back(db); + ASSERT_GT(ci.input_files.size(), 0U); + ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size()); + + for (size_t i = 0; i < ci.input_file_infos.size(); ++i) { + ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level); + ASSERT_EQ(ci.input_file_infos[i].file_number, + TableFileNameToNumber(ci.input_files[i])); + } + + ASSERT_GT(ci.output_files.size(), 0U); + ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size()); + + ASSERT_TRUE(test_); + ASSERT_EQ(test_->db_, db); + + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id], + &files_by_level); + ASSERT_GT(files_by_level.size(), ci.output_level); + + for (size_t i = 0; i < ci.output_file_infos.size(); ++i) { + ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level); + 
+      ASSERT_EQ(ci.output_file_infos[i].file_number,
+                TableFileNameToNumber(ci.output_files[i]));
+
+      auto it = std::find_if(
+          files_by_level[ci.output_level].begin(),
+          files_by_level[ci.output_level].end(),
+          [&](const FileMetaData& meta) {
+            return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+          });
+      ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+      ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+                it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+    ASSERT_GT(ci.thread_id, 0U);
+
+    for (auto fl : {ci.input_files, ci.output_files}) {
+      for (auto fn : fl) {
+        auto it = ci.table_properties.find(fn);
+        ASSERT_NE(it, ci.table_properties.end());
+        auto tp = it->second;
+        ASSERT_TRUE(tp != nullptr);
+        ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+      }
+    }
+  }
+
+  EventListenerTest* test_;
+  std::vector<DB*> compacted_dbs_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  const int kNumL0Files = 4;
+
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  TestCompactionListener* listener = new TestCompactionListener(this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu", "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  CreateAndReopenWithCF(cf_names, options);
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(123, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+                                     nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+  }
+}
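// The test above exercises the full listener surface, but the registration
// pattern itself is small. A minimal sketch (illustrative only, not part of
// this patch; assumes rocksdb/listener.h and <cstdio> are available through
// the includes above):
class LoggingCompactionListener : public EventListener {
 public:
  // Called from a background thread after each compaction finishes.
  void OnCompactionCompleted(DB* /*db*/,
                             const CompactionJobInfo& ci) override {
    fprintf(stderr, "compaction to level %d produced %zu file(s)\n",
            ci.output_level, ci.output_files.size());
  }
};
// Listeners are installed through Options before opening the DB:
//   Options options;
//   options.listeners.emplace_back(
//       std::make_shared<LoggingCompactionListener>());
//   DB* db;
//   Status s = DB::Open(options, "/tmp/listener_example", &db);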
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+  TestFlushListener(Env* env, EventListenerTest* test)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+    db_closed = false;
+  }
+
+  virtual ~TestFlushListener() {
+    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
+  }
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+    // Verify the id of the current thread that created this table
+    // file matches the id of any active flush or compaction thread.
+    uint64_t thread_id = env_->GetThreadID();
+    std::vector<ThreadStatus> thread_list;
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    bool found_match = false;
+    for (auto thread_status : thread_list) {
+      if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+          thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+        if (thread_id == thread_status.thread_id) {
+          found_match = true;
+          break;
+        }
+      }
+    }
+    ASSERT_TRUE(found_match);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (info.triggered_writes_stop) {
+      stop_count++;
+    }
+    // verify whether the previously created file matches the flushed file.
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+    // Note: the following chunk relies on the notification pertaining to the
+    // database pointed to by DBTestBase::db_, and is thus bypassed when
+    // that assumption does not hold (see the test case MultiDBMultiListeners
+    // below).
+    ASSERT_TRUE(test_);
+    if (db == test_->db_) {
+      std::vector<std::vector<FileMetaData>> files_by_level;
+      ASSERT_LT(info.cf_id, test_->handles_.size());
+      ASSERT_GE(info.cf_id, 0u);
+      ASSERT_NE(test_->handles_[info.cf_id], nullptr);
+      test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+                                             &files_by_level);
+
+      ASSERT_FALSE(files_by_level.empty());
+      auto it = std::find_if(files_by_level[0].begin(),
+                             files_by_level[0].end(),
+                             [&](const FileMetaData& meta) {
+                               return meta.fd.GetNumber() == info.file_number;
+                             });
+      ASSERT_NE(it, files_by_level[0].end());
+      ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
+    ASSERT_EQ(
+        info.table_properties.user_collected_properties.find("0")->second,
+        "1");
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
+  EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu", "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  CreateAndReopenWithCF(cf_names, options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(456, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure callback functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  for (auto atomic_flush : {false, true}) {
+    options.atomic_flush = atomic_flush;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    TestFlushListener* listener = new TestFlushListener(options.env, this);
+    options.listeners.emplace_back(listener);
+    options.table_properties_collector_factories.push_back(
+        std::make_shared<TestPropertiesCollectorFactory>());
+    std::vector<std::string> cf_names = {"pikachu", "ilya",     "muromec",
+                                         "dobrynia", "nikitich", "alyosha",
+                                         "popovich"};
+    CreateAndReopenWithCF(cf_names, options);
+
+    ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+    ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+    ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+    ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+    ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+    ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+    ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    for (int i = 1; i < 8; ++i) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
+            "EventListenerTest.MultiCF:PreVerifyListener"}});
+      ASSERT_OK(Flush(i));
+      TEST_SYNC_POINT("EventListenerTest.MultiCF:PreVerifyListener");
+      ASSERT_EQ(listener->flushed_dbs_.size(), i);
+      ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+      // make sure callback functions are called in the right order
+      if (i == 7) {
+        for (size_t j = 0; j < cf_names.size(); j++) {
+          ASSERT_EQ(listener->flushed_dbs_[j], db_);
+          ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
+        }
+      }
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    Close();
+  }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  std::vector<TestFlushListener*> listeners;
+  const int kNumDBs = 5;
+  const int kNumListeners = 10;
+  for (int i = 0; i < kNumListeners; ++i) {
+    listeners.emplace_back(new TestFlushListener(options.env, this));
+  }
+
+  std::vector<std::string> cf_names = {"pikachu", "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumListeners; ++i) {
+    options.listeners.emplace_back(listeners[i]);
+  }
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  std::vector<DB*> dbs;
+  std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
+    DB* db;
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ColumnFamilyHandle* handle;
+      ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+      handles.push_back(handle);
+    }
+
+    vec_handles.push_back(std::move(handles));
+    dbs.push_back(db);
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], cf_names[c],
+                            cf_names[c]));
+    }
+  }
+
+  for (size_t c = 0; c < cf_names.size(); ++c) {
+    for (int d = 0; d < kNumDBs; ++d) {
+      ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+      ASSERT_OK(
+          static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+    }
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+  }
+
+  for (auto* listener : listeners) {
+    int pos = 0;
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      for (int d = 0; d < kNumDBs; ++d) {
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+        pos++;
+      }
+    }
+  }
+
+  for (auto handles : vec_handles) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+  }
+  vec_handles.clear();
+
+  for (auto db : dbs) {
+    delete db;
+  }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  const int kCompactionTrigger = 1;
+  const int kSlowdownTrigger = 5;
+  const int kStopTrigger = 100;
+  options.level0_file_num_compaction_trigger = kCompactionTrigger;
+  options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+  options.level0_stop_writes_trigger = kStopTrigger;
+  options.max_write_buffer_number = 10;
+  options.listeners.emplace_back(listener);
+  // BG compaction is disabled. The number of L0 files will simply keep
+  // increasing in this test.
+  options.compaction_style = kCompactionStyleNone;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+  // keep writing until writes are forced to stop.
+  for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+       ++i) {
+    ASSERT_OK(
+        Put(1, std::to_string(i), std::string(10000, 'x'), WriteOptions()));
+    FlushOptions fo;
+    fo.allow_write_stall = true;
+    ASSERT_OK(db_->Flush(fo, handles_[1]));
+    db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  }
+  // Ensure background work is fully finished including listener callbacks
+  // before accessing listener state.
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compaction_reasons_.push_back(ci.compaction_reason);
+  }
+
+  std::vector<CompactionReason> compaction_reasons_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleLevel;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+  ASSERT_EQ(listener->compaction_reasons_[0],
+            CompactionReason::kLevelL0FilesNum);
+
+  DestroyAndReopen(options);
+
+  // Write 3 non-overlapping files in L0
+  for (int k = 1; k <= 30; k++) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    if (k % 10 == 0) {
+      Flush();
+    }
+  }
+
+  // Do a trivial move from L0 -> L1
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  options.max_bytes_for_level_base = 1;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "value"));
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
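// The CompactionReason tests below each tune options until a specific reason
// fires. For reference, a consumer of these notifications might branch on
// the reason like this (a minimal sketch, not part of this patch; only the
// enum values exercised by the surrounding tests are shown):
inline const char* CompactionReasonLabel(CompactionReason reason) {
  switch (reason) {
    case CompactionReason::kLevelL0FilesNum:
      return "level: L0 file count trigger";
    case CompactionReason::kLevelMaxLevelSize:
      return "level: max level size exceeded";
    case CompactionReason::kUniversalSizeRatio:
      return "universal: size ratio";
    case CompactionReason::kUniversalSizeAmplification:
      return "universal: size amplification";
    case CompactionReason::kFIFOMaxSize:
      return "FIFO: max table files size";
    case CompactionReason::kManualCompaction:
      return "manual CompactRange()";
    default:
      return "other";
  }
}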
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.compaction_style = kCompactionStyleUniversal;
+
+  Random rnd(301);
+
+  options.level0_file_num_compaction_trigger = 8;
+  options.compaction_options_universal.max_size_amplification_percent = 100000;
+  options.compaction_options_universal.size_ratio = 100000;
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+  }
+
+  options.level0_file_num_compaction_trigger = 8;
+  options.compaction_options_universal.max_size_amplification_percent = 1;
+  options.compaction_options_universal.size_ratio = 100000;
+
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason,
+              CompactionReason::kUniversalSizeAmplification);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.compaction_options_fifo.max_table_files_size = 1;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+  }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+  class TestEnv : public EnvWrapper {
+   public:
+    explicit TestEnv(Env* t) : EnvWrapper(t) {}
+    static const char* kClassName() { return "TestEnv"; }
+    const char* Name() const override { return kClassName(); }
+
+    void SetStatus(Status s) { status_ = s; }
+
+    Status NewWritableFile(const std::string& fname,
+                           std::unique_ptr<WritableFile>* result,
+                           const EnvOptions& options) override {
+      if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+        if (!status_.ok()) {
+          return status_;
+        }
+      }
+      return target()->NewWritableFile(fname, result, options);
+    }
+
+   private:
+    Status status_;
+  };
+
+  TableFileCreationListener() {
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  int Index(TableFileCreationReason reason) {
+    int idx;
+    switch (reason) {
+      case TableFileCreationReason::kFlush:
+        idx = 0;
+        break;
+      case TableFileCreationReason::kCompaction:
+        idx = 1;
+        break;
+      default:
+        idx = -1;
+    }
+    return idx;
+  }
+
+  void CheckAndResetCounters(int flush_started, int flush_finished,
+                             int flush_failure, int compaction_started,
+                             int compaction_finished,
+                             int compaction_failure) {
+    ASSERT_EQ(started_[0], flush_started);
+    ASSERT_EQ(finished_[0], flush_finished);
+    ASSERT_EQ(failure_[0], flush_failure);
+    ASSERT_EQ(started_[1], compaction_started);
+    ASSERT_EQ(finished_[1], compaction_finished);
+    ASSERT_EQ(failure_[1], compaction_failure);
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      started_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      finished_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+    if (info.status.ok()) {
+      if (info.table_properties.num_range_deletions == 0U) {
+        ASSERT_GT(info.table_properties.data_size, 0U);
+        ASSERT_GT(info.table_properties.raw_key_size, 0U);
+        ASSERT_GT(info.table_properties.raw_value_size, 0U);
+        ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+        ASSERT_GT(info.table_properties.num_entries, 0U);
+      }
+    } else {
+      if (idx >= 0) {
+        failure_[idx]++;
+        last_failure_ = info.status;
+      }
+    }
+  }
+
+  int started_[2];
+  int finished_[2];
+  int failure_[2];
+  Status last_failure_;
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+  auto listener = std::make_shared<TableFileCreationListener>();
+  Options options;
+  std::unique_ptr<TableFileCreationListener::TestEnv> test_env(
+      new TableFileCreationListener::TestEnv(CurrentOptions().env));
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  options.env = test_env.get();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "aaa"));
+  ASSERT_OK(Put("bar", "bbb"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+  ASSERT_OK(Put("foo", "aaa1"));
+  ASSERT_OK(Put("bar", "bbb1"));
+  test_env->SetStatus(Status::NotSupported("not supported"));
+  ASSERT_NOK(Flush());
+  listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+  ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+  test_env->SetStatus(Status::OK());
+
+  Reopen(options);
+  ASSERT_OK(Put("foo", "aaa2"));
+  ASSERT_OK(Put("bar", "bbb2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+  const Slice kRangeStart = "a";
+  const Slice kRangeEnd = "z";
+  ASSERT_OK(
+      dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+  ASSERT_OK(Put("foo", "aaa3"));
+  ASSERT_OK(Put("bar", "bbb3"));
+  ASSERT_OK(Flush());
+  test_env->SetStatus(Status::NotSupported("not supported"));
+  ASSERT_NOK(
+      dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+  ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+  listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+  ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+
+  // Reset
+  test_env->SetStatus(Status::OK());
+  DestroyAndReopen(options);
+
+  // Verify that an empty table file that is immediately deleted gives Aborted
+  // status to listener.
+ ASSERT_OK(Put("baz", "z")); + ASSERT_OK(SingleDelete("baz")); + ASSERT_OK(Flush()); + listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); + ASSERT_TRUE(listener->last_failure_.IsAborted()); + + // Also in compaction + ASSERT_OK(Put("baz", "z")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + kRangeStart, kRangeEnd)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + listener->CheckAndResetCounters(2, 2, 0, 1, 1, 1); + ASSERT_TRUE(listener->last_failure_.IsAborted()); + + Close(); // Avoid UAF on listener +} + +class MemTableSealedListener : public EventListener { + private: + SequenceNumber latest_seq_number_; + + public: + MemTableSealedListener() {} + void OnMemTableSealed(const MemTableInfo& info) override { + latest_seq_number_ = info.first_seqno; + } + + void OnFlushCompleted(DB* /*db*/, + const FlushJobInfo& flush_job_info) override { + ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_); + } +}; + +TEST_F(EventListenerTest, MemTableSealedListenerTest) { + auto listener = std::make_shared(); + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.listeners.push_back(listener); + DestroyAndReopen(options); + + for (unsigned int i = 0; i < 10; i++) { + std::string tag = std::to_string(i); + ASSERT_OK(Put("foo" + tag, "aaa")); + ASSERT_OK(Put("bar" + tag, "bbb")); + + ASSERT_OK(Flush()); + } +} + +class ColumnFamilyHandleDeletionStartedListener : public EventListener { + private: + std::vector cfs_; + int counter; + + public: + explicit ColumnFamilyHandleDeletionStartedListener( + const std::vector& cfs) + : cfs_(cfs), counter(0) { + cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName); + } + void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) override { + ASSERT_EQ(cfs_[handle->GetID()], handle->GetName()); + counter++; + } + int getCounter() { return counter; } +}; + +TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) { + std::vector cfs{"pikachu", "eevee", "Mewtwo"}; + auto listener = + std::make_shared(cfs); + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.listeners.push_back(listener); + CreateAndReopenWithCF(cfs, options); + ASSERT_EQ(handles_.size(), 4); + delete handles_[3]; + delete handles_[2]; + delete handles_[1]; + handles_.resize(1); + ASSERT_EQ(listener->getCounter(), 3); +} + +class BackgroundErrorListener : public EventListener { + private: + SpecialEnv* env_; + int counter_; + + public: + BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {} + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (counter_ == 0) { + // suppress the first error and disable write-dropping such that a retry + // can succeed. 
+      *bg_error = Status::OK();
+      env_->drop_writes_.store(false, std::memory_order_release);
+      env_->SetMockSleep(false);
+    }
+    ++counter_;
+  }
+
+  int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+  auto listener = std::make_shared<BackgroundErrorListener>(env_);
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.listeners.push_back(listener);
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+  // forge a custom one for the failed flush case.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkFlush:done",
+        "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->SetMockSleep();
+
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(Put("key1", "val"));
+  TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+  ASSERT_EQ(1, listener->counter());
+  ASSERT_OK(Put("key2", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+  auto listener = std::make_shared<BackgroundErrorListener>(env_);
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.push_back(listener);
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  // third iteration triggers the second memtable's flush
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("key0", "val"));
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(Put("key1", "val"));
+  }
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->SetMockSleep();
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, listener->counter());
+
+  // trigger flush so compaction is triggered again; this time it succeeds
+  // The previous failed compaction may get retried automatically, so we may
+  // be left with 0 or 1 files in level 1, depending on when the retry gets
+  // scheduled
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
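The OnBackgroundError hook used above deserves a note: the Status* out-parameter is read back by RocksDB, so a listener can downgrade an error and keep the DB out of read-only mode. A hedged sketch of that pattern (application code, not part of the patch; swallowing errors trades durability guarantees for availability, so use with care):

#include <atomic>

#include "rocksdb/listener.h"

class SuppressFirstErrorListener : public rocksdb::EventListener {
 public:
  void OnBackgroundError(rocksdb::BackgroundErrorReason /*reason*/,
                         rocksdb::Status* bg_error) override {
    // Overwriting *bg_error with OK() tells RocksDB to continue instead of
    // entering read-only mode. Only the first error is suppressed here.
    if (!suppressed_.exchange(true)) {
      *bg_error = rocksdb::Status::OK();
    }
  }

 private:
  std::atomic<bool> suppressed_{false};
};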
+
+class TestFileOperationListener : public EventListener {
+ public:
+  TestFileOperationListener() {
+    file_reads_.store(0);
+    file_reads_success_.store(0);
+    file_writes_.store(0);
+    file_writes_success_.store(0);
+    file_flushes_.store(0);
+    file_flushes_success_.store(0);
+    file_closes_.store(0);
+    file_closes_success_.store(0);
+    file_syncs_.store(0);
+    file_syncs_success_.store(0);
+    file_truncates_.store(0);
+    file_truncates_success_.store(0);
+    file_seq_reads_.store(0);
+    blob_file_reads_.store(0);
+    blob_file_writes_.store(0);
+    blob_file_flushes_.store(0);
+    blob_file_closes_.store(0);
+    blob_file_syncs_.store(0);
+    blob_file_truncates_.store(0);
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    ++file_reads_;
+    if (info.status.ok()) {
+      ++file_reads_success_;
+    }
+    if (info.path.find("MANIFEST") != std::string::npos) {
+      ++file_seq_reads_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_reads_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    ++file_writes_;
+    if (info.status.ok()) {
+      ++file_writes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_writes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileFlushFinish(const FileOperationInfo& info) override {
+    ++file_flushes_;
+    if (info.status.ok()) {
+      ++file_flushes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_flushes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileCloseFinish(const FileOperationInfo& info) override {
+    ++file_closes_;
+    if (info.status.ok()) {
+      ++file_closes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_closes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileSyncFinish(const FileOperationInfo& info) override {
+    ++file_syncs_;
+    if (info.status.ok()) {
+      ++file_syncs_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_syncs_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileTruncateFinish(const FileOperationInfo& info) override {
+    ++file_truncates_;
+    if (info.status.ok()) {
+      ++file_truncates_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_truncates_;
+    }
+    ReportDuration(info);
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+  std::atomic<size_t> file_reads_;
+  std::atomic<size_t> file_reads_success_;
+  std::atomic<size_t> file_writes_;
+  std::atomic<size_t> file_writes_success_;
+  std::atomic<size_t> file_flushes_;
+  std::atomic<size_t> file_flushes_success_;
+  std::atomic<size_t> file_closes_;
+  std::atomic<size_t> file_closes_success_;
+  std::atomic<size_t> file_syncs_;
+  std::atomic<size_t> file_syncs_success_;
+  std::atomic<size_t> file_truncates_;
+  std::atomic<size_t> file_truncates_success_;
+  std::atomic<size_t> file_seq_reads_;
+  std::atomic<size_t> blob_file_reads_;
+  std::atomic<size_t> blob_file_writes_;
+  std::atomic<size_t> blob_file_flushes_;
+  std::atomic<size_t> blob_file_closes_;
+  std::atomic<size_t> blob_file_syncs_;
+  std::atomic<size_t> blob_file_truncates_;
+
+ private:
+  void ReportDuration(const FileOperationInfo& info) const {
+    ASSERT_GT(info.duration.count(), 0);
+  }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  options.use_direct_io_for_flush_and_compaction = true;
+  Status s = TryReopen(options);
+  if (s.IsInvalidArgument()) {
+    options.use_direct_io_for_flush_and_compaction = false;
+  } else {
+    ASSERT_OK(s);
+  }
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_GE(listener->file_writes_.load(),
+            listener->file_writes_success_.load());
+  ASSERT_GT(listener->file_writes_.load(), 0);
+  ASSERT_GE(listener->file_flushes_.load(),
+            listener->file_flushes_success_.load());
+  ASSERT_GT(listener->file_flushes_.load(), 0);
+  Close();
+
+  Reopen(options);
+  ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+  ASSERT_GT(listener->file_reads_.load(), 0);
+  ASSERT_GE(listener->file_closes_.load(),
+            listener->file_closes_success_.load());
+  ASSERT_GT(listener->file_closes_.load(), 0);
+  ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load());
+  ASSERT_GT(listener->file_syncs_.load(), 0);
+  if (true == options.use_direct_io_for_flush_and_compaction) {
+    ASSERT_GE(listener->file_truncates_.load(),
+              listener->file_truncates_success_.load());
+    ASSERT_GT(listener->file_truncates_.load(), 0);
+  }
+}
+
+TEST_F(EventListenerTest, OnBlobFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  ASSERT_GT(listener->blob_file_writes_.load(), 0U);
+  ASSERT_GT(listener->blob_file_flushes_.load(), 0U);
+  Close();
+
+  Reopen(options);
+  ASSERT_GT(listener->blob_file_closes_.load(), 0U);
+  ASSERT_GT(listener->blob_file_syncs_.load(), 0U);
+  if (true == options.use_direct_io_for_flush_and_compaction) {
+    ASSERT_GT(listener->blob_file_truncates_.load(), 0U);
+  }
+}
+
+TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  options.use_direct_io_for_flush_and_compaction = true;
+  Status s = TryReopen(options);
+  if (s.IsInvalidArgument()) {
+    options.use_direct_io_for_flush_and_compaction = false;
+  } else {
+    ASSERT_OK(s);
+  }
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  Close();
+
+  size_t seq_reads = listener->file_seq_reads_.load();
+  Reopen(options);
+  ASSERT_GT(listener->file_seq_reads_.load(), seq_reads);
+}
+
+class BlobDBJobLevelEventListenerTest : public EventListener {
+ public:
+  explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
+      : test_(test), call_count_(0) {}
+
+  const VersionStorageInfo* GetVersionStorageInfo() const {
+    VersionSet* const versions = test_->dbfull()->GetVersionSet();
+    assert(versions);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    EXPECT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    EXPECT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    EXPECT_NE(storage_info, nullptr);
+
+    return storage_info;
+  }
+
+  void CheckBlobFileAdditions(
+      const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
+    const auto* vstorage = GetVersionStorageInfo();
+
+    EXPECT_FALSE(blob_file_addition_infos.empty());
+
+    for (const auto& blob_file_addition_info : blob_file_addition_infos) {
+      const auto meta = vstorage->GetBlobFileMetaData(
+          blob_file_addition_info.blob_file_number);
+
+      EXPECT_NE(meta, nullptr);
+      EXPECT_EQ(meta->GetBlobFileNumber(),
+                blob_file_addition_info.blob_file_number);
+      EXPECT_EQ(meta->GetTotalBlobBytes(),
+                blob_file_addition_info.total_blob_bytes);
+      EXPECT_EQ(meta->GetTotalBlobCount(),
+                blob_file_addition_info.total_blob_count);
+      EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+    }
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (const auto& fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    call_count_++;
+
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      flushed_files_.push_back(info.file_path);
+    }
+
+    EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+    CheckBlobFileAdditions(info.blob_file_addition_infos);
+  }
+
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& info) override {
+    call_count_++;
+
+    EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+    CheckBlobFileAdditions(info.blob_file_addition_infos);
+
+    EXPECT_FALSE(info.blob_file_garbage_infos.empty());
+
+    for (const auto& blob_file_garbage_info : info.blob_file_garbage_infos) {
+      EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+      EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+      EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+      EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+    }
+  }
+
+  EventListenerTest* test_;
+  uint32_t call_count_;
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+// Test OnFlushCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnFlushCompleted) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  options.min_blob_size = 0;
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("Key1"), "blob_value1");
+  ASSERT_EQ(Get("Key2"), "blob_value2");
+  ASSERT_EQ(Get("Key3"), "blob_value3");
+
+  ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test OnCompactionCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  blob_event_listener->call_count_ = 0;
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  // Make sure OnCompactionCompleted is called.
+  ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test that CompactFiles calls the OnCompactionCompleted EventListener for
+// blob files and populates the blob file info.
+TEST_F(EventListenerTest, BlobDBCompactFiles) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> output_file_names;
+  CompactionJobInfo compaction_job_info;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction,
+  // which will be populated in output_file_names.
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1,
+      &output_file_names, &compaction_job_info));
+
+  bool is_blob_in_output = false;
+  for (const auto& file : output_file_names) {
+    if (EndsWith(file, ".blob")) {
+      is_blob_in_output = true;
+    }
+  }
+  ASSERT_TRUE(is_blob_in_output);
+
+  for (const auto& blob_file_addition_info :
+       compaction_job_info.blob_file_addition_infos) {
+    EXPECT_GT(blob_file_addition_info.blob_file_number, 0U);
+    EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U);
+    EXPECT_GT(blob_file_addition_info.total_blob_count, 0U);
+    EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+  }
+
+  for (const auto& blob_file_garbage_info :
+       compaction_job_info.blob_file_garbage_infos) {
+    EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+    EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+    EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+    EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+  }
+}
+
+class BlobDBFileLevelEventListener : public EventListener {
+ public:
+  void OnBlobFileCreationStarted(
+      const BlobFileCreationBriefInfo& info) override {
+    files_started_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.cf_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+  }
+
+  void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+    files_created_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.cf_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+    EXPECT_GT(info.total_blob_count, 0U);
+    EXPECT_GT(info.total_blob_bytes, 0U);
+    EXPECT_EQ(info.file_checksum, kUnknownFileChecksum);
+    EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+    EXPECT_TRUE(info.status.ok());
+  }
+
+  void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override {
+    files_deleted_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+    EXPECT_TRUE(info.status.ok());
+  }
+
+  void CheckCounters() {
+    EXPECT_EQ(files_started_, files_created_);
+    EXPECT_GT(files_started_, 0U);
+    EXPECT_GT(files_deleted_, 0U);
+    EXPECT_LT(files_deleted_, files_created_);
+  }
+
+ private:
+  std::atomic<uint32_t> files_started_{};
+  std::atomic<uint32_t> files_created_{};
+  std::atomic<uint32_t> files_deleted_{};
+};
+
+TEST_F(EventListenerTest, BlobDBFileTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  BlobDBFileLevelEventListener* blob_event_listener =
+      new BlobDBFileLevelEventListener();
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  blob_event_listener->CheckCounters();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..d397372f4
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+
+  // For recycled log files
+  kRecyclableFullType = 5,
+  kRecyclableFirstType = 6,
+  kRecyclableMiddleType = 7,
+  kRecyclableLastType = 8,
+
+  // Compression Type
+  kSetCompressionType = 9,
+};
+static const int kMaxRecordType = kSetCompressionType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
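To make the header layout above concrete, the following standalone sketch (illustrative, not part of the patch) decodes the 7-byte legacy header from the start of a record; the recyclable variant simply appends a 4-byte log number at offset 7. Fields are little-endian, matching util/coding.h, and the memcpy decoding below assumes a little-endian host:

#include <cstdint>
#include <cstring>

struct LegacyRecordHeader {
  uint32_t masked_crc;  // bytes 0..3: crc32c of type byte + payload, masked
  uint16_t length;      // bytes 4..5: payload length; payload follows header
  uint8_t type;         // byte 6: one of RecordType
};

inline LegacyRecordHeader DecodeLegacyHeader(const char* p) {
  LegacyRecordHeader h;
  std::memcpy(&h.masked_crc, p, sizeof(h.masked_crc));
  std::memcpy(&h.length, p + 4, sizeof(h.length));
  h.type = static_cast<uint8_t>(p[6]);
  return h;
}

Records never straddle a 32768-byte block boundary; a payload larger than the space left in the current block is split into kFirstType/kMiddleType/kLastType fragments, each carrying its own header.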
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..a21868776
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,854 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+
+#include "file/sequence_file_reader.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+               std::unique_ptr<SequentialFileReader>&& _file,
+               Reporter* reporter, bool checksum, uint64_t log_num)
+    : info_log_(info_log),
+      file_(std::move(_file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      read_error_(false),
+      eof_offset_(0),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      log_number_(log_num),
+      recycled_(false),
+      first_record_read_(false),
+      compression_type_(kNoCompression),
+      compression_type_record_read_(false),
+      uncompress_(nullptr),
+      hash_state_(nullptr),
+      uncompress_hash_state_(nullptr){};
+
+Reader::~Reader() {
+  delete[] backing_store_;
+  if (uncompress_) {
+    delete uncompress_;
+  }
+  if (hash_state_) {
+    XXH3_freeState(hash_state_);
+  }
+  if (uncompress_hash_state_) {
+    XXH3_freeState(uncompress_hash_state_);
+  }
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files. For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+                        WALRecoveryMode wal_recovery_mode,
+                        uint64_t* record_checksum) {
+  scratch->clear();
+  record->clear();
+  if (record_checksum != nullptr) {
+    if (hash_state_ == nullptr) {
+      hash_state_ = XXH3_createState();
+    }
+    XXH3_64bits_reset(hash_state_);
+  }
+  if (uncompress_) {
+    uncompress_->Reset();
+  }
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    size_t drop_size = 0;
+    const unsigned int record_type =
+        ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
+    switch (record_type) {
+      case kFullType:
+      case kRecyclableFullType:
+        if (in_fragmented_record && !scratch->empty()) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          ReportCorruption(scratch->size(), "partial record without end(1)");
+        }
+        // No need to compute record_checksum since the record
+        // consists of a single fragment and the checksum is computed
+        // in ReadPhysicalRecord() if WAL compression is enabled
+        if (record_checksum != nullptr && uncompress_ == nullptr) {
+          // No need to stream since the record is a single fragment
+          *record_checksum = XXH3_64bits(fragment.data(), fragment.size());
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        first_record_read_ = true;
+        return true;
+
+      case kFirstType:
+      case kRecyclableFirstType:
+        if (in_fragmented_record && !scratch->empty()) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          ReportCorruption(scratch->size(), "partial record without end(2)");
+          XXH3_64bits_reset(hash_state_);
+        }
+        if (record_checksum != nullptr) {
+          XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+      case kRecyclableMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          if (record_checksum != nullptr) {
+            XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+          }
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+      case kRecyclableLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          if (record_checksum != nullptr) {
+            XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+            *record_checksum = XXH3_64bits_digest(hash_state_);
+          }
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          first_record_read_ = true;
+          return true;
+        }
+        break;
+
+      case kBadHeader:
+        if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+            wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+          // In clean shutdown we don't expect any error in the log files.
+          // In point-in-time recovery an incomplete record at the end could
+          // produce a hole in the recovered data. Report an error here, which
+          // higher layers can choose to ignore when it's provable there is no
+          // hole.
+          ReportCorruption(drop_size, "truncated header");
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kEof:
+        if (in_fragmented_record) {
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
+            ReportCorruption(scratch->size(), "error reading trailing data");
+          }
+          // This can be caused by the writer dying immediately after
+          // writing a physical record but before completing the next; don't
+          // treat it as a corruption, just ignore the entire logical record.
+          scratch->clear();
+        }
+        return false;
+
+      case kOldRecord:
+        if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+          // Treat a record from a previous instance of the log as EOF.
+          if (in_fragmented_record) {
+            if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+                wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+              // In clean shutdown we don't expect any error in the log files.
+              // In point-in-time recovery an incomplete record at the end could
+              // produce a hole in the recovered data. Report an error here,
+              // which higher layers can choose to ignore when it's provable
+              // there is no hole.
+              ReportCorruption(scratch->size(), "error reading trailing data");
+            }
+            // This can be caused by the writer dying immediately after
+            // writing a physical record but before completing the next; don't
+            // treat it as a corruption, just ignore the entire logical record.
+            scratch->clear();
+          }
+          return false;
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      case kBadRecordLen:
+        if (eof_) {
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
+            ReportCorruption(drop_size, "truncated record body");
+          }
+          return false;
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kBadRecordChecksum:
+        if (recycled_ && wal_recovery_mode ==
+                             WALRecoveryMode::kTolerateCorruptedTailRecords) {
+          scratch->clear();
+          return false;
+        }
+        if (record_type == kBadRecordLen) {
+          ReportCorruption(drop_size, "bad record length");
+        } else {
+          ReportCorruption(drop_size, "checksum mismatch");
+        }
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      case kSetCompressionType: {
+        if (compression_type_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "read multiple SetCompressionType records");
+        }
+        if (first_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "SetCompressionType not the first record");
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        last_record_offset_ = prospective_record_offset;
+        CompressionTypeRecord compression_record(kNoCompression);
+        Status s = compression_record.DecodeFrom(&fragment);
+        if (!s.ok()) {
+          ReportCorruption(fragment.size(),
+                           "could not decode SetCompressionType record");
+        } else {
+          InitCompression(compression_record);
+        }
+        break;
+      }
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
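ReadRecord() is the whole public surface of the reader: callers loop until it returns false and receive one logical record (one serialized WriteBatch, in the WAL case) per call. A usage sketch under stated assumptions (the caller already has a SequentialFileReader for an existing log file; error handling is elided):

#include <iostream>
#include <memory>
#include <string>

#include "db/log_reader.h"

class StderrReporter : public rocksdb::log::Reader::Reporter {
 public:
  void Corruption(size_t bytes, const rocksdb::Status& s) override {
    std::cerr << "dropped " << bytes << " bytes: " << s.ToString() << std::endl;
  }
};

void DumpLog(std::unique_ptr<rocksdb::SequentialFileReader>&& file,
             uint64_t log_number) {
  StderrReporter reporter;
  rocksdb::log::Reader reader(nullptr /* info_log */, std::move(file),
                              &reporter, true /* checksum */, log_number);
  std::string scratch;
  rocksdb::Slice record;
  // The returned Slice is only valid until the next ReadRecord() call or
  // until scratch is mutated, so consume it before iterating.
  while (reader.ReadRecord(&record, &scratch)) {
    std::cerr << "record: " << record.size() << " bytes" << std::endl;
  }
}

A stricter caller would pass WALRecoveryMode::kAbsoluteConsistency as the third argument so a truncated tail record is reported as corruption rather than silently treated as end-of-file.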
+
+uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
+
+uint64_t Reader::LastRecordEnd() {
+  return end_of_buffer_offset_ - buffer_.size();
+}
+
+void Reader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+  eof_ = false;
+  if (eof_offset_ == 0) {
+    return;
+  }
+  UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+  // If the EOF was in the middle of a block (a partial block was read) we have
+  // to read the rest of the block as ReadPhysicalRecord can only read full
+  // blocks and expects the file position indicator to be aligned to the start
+  // of a block.
+  //
+  //       consumed_bytes + buffer_size() + remaining == kBlockSize
+
+  size_t consumed_bytes = eof_offset_ - buffer_.size();
+  size_t remaining = kBlockSize - eof_offset_;
+
+  // backing_store_ is used to concatenate what is left in buffer_ and
+  // the remainder of the block. If buffer_ already uses backing_store_,
+  // we just append the new data.
+  if (buffer_.data() != backing_store_ + consumed_bytes) {
+    // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store.
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  // TODO: rate limit log reader with appropriate priority.
+  // TODO: avoid overcharging rate limiter:
+  // Note that the Read here might overcharge SequentialFileReader's internal
+  // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+  // content left until EOF to read.
+  Status status =
+      file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
+                  Env::IO_TOTAL /* rate_limiter_priority */);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+            read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+                  eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    eof_offset_ = 0;
+  }
+}
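A quick worked example of the invariant in UnmarkEOFInternal(): with kBlockSize = 32768, suppose EOF was hit after a partial block of eof_offset_ = 100 bytes and 30 of those bytes are still unconsumed in buffer_. Then consumed_bytes = 100 - 30 = 70 and remaining = 32768 - 100 = 32668, and indeed 70 + 30 + 32668 = 32768: after the extra Read(), the reader is once again aligned to a block boundary.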
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int* error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    // TODO: rate limit log reader with appropriate priority.
+    // TODO: avoid overcharging rate limiter:
+    // Note that the Read here might overcharge SequentialFileReader's internal
+    // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+    // content left until EOF to read.
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+    TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+    }
+    return true;
+  } else {
+    // Note that if buffer_ is non-empty, we have a truncated header at the
+    // end of the file, which can be caused by the writer crashing in the
+    // middle of writing the header. Unless explicitly requested we don't
+    // consider this an error, just report EOF.
+    if (buffer_.size()) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *error = kBadHeader;
+      return false;
+    }
+    buffer_.clear();
+    *error = kEof;
+    return false;
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
+                                        uint64_t* fragment_checksum) {
+  while (true) {
+    // We need at least the minimum header size
+    if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+      // the default value of r is meaningless because ReadMore will overwrite
+      // it if it returns false; in case it returns true, the return value will
+      // not be used anyway
+      int r = kEof;
+      if (!ReadMore(drop_size, &r)) {
+        return r;
+      }
+      continue;
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    int header_size = kHeaderSize;
+    if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+      if (end_of_buffer_offset_ - buffer_.size() == 0) {
+        recycled_ = true;
+      }
+      header_size = kRecyclableHeaderSize;
+      // We need enough for the larger header
+      if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+        int r = kEof;
+        if (!ReadMore(drop_size, &r)) {
+          return r;
+        }
+        continue;
+      }
+      const uint32_t log_num = DecodeFixed32(header + 7);
+      if (log_num != log_number_) {
+        return kOldRecord;
+      }
+    }
+    if (header_size + length > buffer_.size()) {
+      assert(buffer_.size() >= static_cast<size_t>(header_size));
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      // If the end of the read has been reached without seeing
+      // `header_size + length` bytes of payload, report a corruption. The
+      // higher layers can decide how to handle it based on the recovery mode,
+      // whether this occurred at EOF, whether this is the final WAL, etc.
+      return kBadRecordLen;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in DB written by new RocksDB versions,
+      // since we turn off mmap writes to manifest and log files
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        *drop_size = buffer_.size();
+        buffer_.clear();
+        return kBadRecordChecksum;
+      }
+    }
+
+    buffer_.remove_prefix(header_size + length);
+
+    if (!uncompress_ || type == kSetCompressionType) {
+      *result = Slice(header + header_size, length);
+      return type;
+    } else {
+      // Uncompress compressed records
+      uncompressed_record_.clear();
+      if (fragment_checksum != nullptr) {
+        if (uncompress_hash_state_ == nullptr) {
+          uncompress_hash_state_ = XXH3_createState();
+        }
+        XXH3_64bits_reset(uncompress_hash_state_);
+      }
+
+      size_t uncompressed_size = 0;
+      int remaining = 0;
+      do {
+        remaining = uncompress_->Uncompress(header + header_size, length,
+                                            uncompressed_buffer_.get(),
+                                            &uncompressed_size);
+        if (remaining < 0) {
+          buffer_.clear();
+          return kBadRecord;
+        }
+        if (uncompressed_size > 0) {
+          if (fragment_checksum != nullptr) {
+            XXH3_64bits_update(uncompress_hash_state_,
+                               uncompressed_buffer_.get(), uncompressed_size);
+          }
+          uncompressed_record_.append(uncompressed_buffer_.get(),
+                                      uncompressed_size);
+        }
+      } while (remaining > 0 || uncompressed_size == kBlockSize);
+
+      if (fragment_checksum != nullptr) {
+        // We can remove this check by updating hash_state_ directly,
+        // but that requires resetting hash_state_ for full and first types
+        // for edge cases like consecutive first type records.
+        // Leaving the check as is since it is cleaner and can revert to the
+        // above approach if it causes performance impact.
+        *fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_);
+        uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(),
+                                               uncompressed_record_.size());
+        if (*fragment_checksum != actual_checksum) {
+          // uncompressed_record_ contains bad content that does not match
+          // actual decompressed content
+          return kBadRecord;
+        }
+      }
+      *result = Slice(uncompressed_record_);
+      return type;
+    }
+  }
+}
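The checksum convention used here is worth spelling out: the stored value is a masked crc32c over the type byte and the payload (header bytes 6 onward, i.e. length + header_size - 6 bytes). A standalone verification sketch for the legacy 7-byte header (illustrative, not part of the patch):

#include <cstdint>

#include "util/coding.h"
#include "util/crc32c.h"

// `rec` points at a complete legacy record: a 7-byte header plus payload.
bool VerifyLegacyRecordCrc(const char* rec, uint32_t payload_len) {
  const uint32_t expected =
      rocksdb::crc32c::Unmask(rocksdb::DecodeFixed32(rec));
  const uint32_t actual =
      rocksdb::crc32c::Value(rec + 6, 1 /* type byte */ + payload_len);
  return actual == expected;
}

The CRC is stored masked (see util/crc32c.h) so that computing a CRC over data that itself contains CRCs stays well-distributed.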
record without end(2)"); + } + prospective_record_offset = physical_record_offset; + fragments_.assign(fragment.data(), fragment.size()); + in_fragmented_record_ = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + scratch->assign(fragments_.data(), fragments_.size()); + fragments_.clear(); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + first_record_read_ = true; + in_fragmented_record_ = false; + return true; + } + break; + + case kBadHeader: + case kBadRecord: + case kEof: + case kOldRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + ReportCorruption(drop_size, "checksum mismatch"); + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kSetCompressionType: { + if (compression_type_record_read_) { + ReportCorruption(fragment.size(), + "read multiple SetCompressionType records"); + } + if (first_record_read_) { + ReportCorruption(fragment.size(), + "SetCompressionType not the first record"); + } + fragments_.clear(); + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + CompressionTypeRecord compression_record(kNoCompression); + Status s = compression_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption(fragment.size(), + "could not decode SetCompressionType record"); + } else { + InitCompression(compression_record); + } + break; + } + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", + fragment_type_or_err); + ReportCorruption( + fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), + buf); + in_fragmented_record_ = false; + fragments_.clear(); + break; + } + } + } + return false; +} + +void FragmentBufferedReader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + UnmarkEOFInternal(); +} + +bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + // TODO: rate limit log reader with approriate priority. + // TODO: avoid overcharging rate limiter: + // Note that the Read here might overcharge SequentialFileReader's internal + // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough + // content left until EOF to read. 
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+    }
+    return true;
+  } else if (!read_error_) {
+    UnmarkEOF();
+  }
+  if (!read_error_) {
+    return true;
+  }
+  *error = kEof;
+  *drop_size = buffer_.size();
+  if (buffer_.size() > 0) {
+    *error = kBadHeader;
+  }
+  buffer_.clear();
+  return false;
+}
+
+// return true if the caller should process the fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+    Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+  assert(fragment != nullptr);
+  assert(drop_size != nullptr);
+  assert(fragment_type_or_err != nullptr);
+
+  while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+  const char* header = buffer_.data();
+  const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+  const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+  const unsigned int type = header[6];
+  const uint32_t length = a | (b << 8);
+  int header_size = kHeaderSize;
+  if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+    if (end_of_buffer_offset_ - buffer_.size() == 0) {
+      recycled_ = true;
+    }
+    header_size = kRecyclableHeaderSize;
+    while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+      size_t old_size = buffer_.size();
+      int error = kEof;
+      if (!TryReadMore(drop_size, &error)) {
+        *fragment_type_or_err = error;
+        return false;
+      } else if (old_size == buffer_.size()) {
+        return false;
+      }
+    }
+    const uint32_t log_num = DecodeFixed32(header + 7);
+    if (log_num != log_number_) {
+      *fragment_type_or_err = kOldRecord;
+      return true;
+    }
+  }
+
+  while (header_size + length > buffer_.size()) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+
+  if (type == kZeroType && length == 0) {
+    buffer_.clear();
+    *fragment_type_or_err = kBadRecord;
+    return true;
+  }
+
+  if (checksum_) {
+    uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+    uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+    if (actual_crc != expected_crc) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *fragment_type_or_err = kBadRecordChecksum;
+      return true;
+    }
+  }
+
+  buffer_.remove_prefix(header_size + length);
+
+  if (!uncompress_ || type == kSetCompressionType) {
+    *fragment = Slice(header + header_size, length);
+    *fragment_type_or_err = type;
+    return true;
+  } else {
+    // Uncompress compressed records
+    uncompressed_record_.clear();
+    size_t uncompressed_size = 0;
+    int remaining = 0;
+    do {
+      remaining = uncompress_->Uncompress(header + header_size, length,
+                                          uncompressed_buffer_.get(),
+                                          &uncompressed_size);
+      if (remaining < 0) {
+        buffer_.clear();
+        *fragment_type_or_err = kBadRecord;
+        return true;
+      }
+      if (uncompressed_size > 0) {
+        uncompressed_record_.append(uncompressed_buffer_.get(),
+                                    uncompressed_size);
+      }
+    } while (remaining > 0 || uncompressed_size == kBlockSize);
+    *fragment = Slice(std::move(uncompressed_record_));
+    *fragment_type_or_err = type;
+    return true;
+  }
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
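A detail of the checksum plumbing above: for multi-fragment records the reader streams XXH3 state across fragments, while single-fragment records are hashed in one shot, and the two must agree. That is the standard XXH3 streaming invariant, shown here in isolation using the same xxhash calls the reader itself uses (sketch, not part of the patch):

#include <cassert>
#include <cstdint>
#include <string>

#include "util/xxhash.h"

void StreamingMatchesOneShot(const std::string& frag1,
                             const std::string& frag2) {
  XXH3_state_t* state = XXH3_createState();
  XXH3_64bits_reset(state);
  XXH3_64bits_update(state, frag1.data(), frag1.size());
  XXH3_64bits_update(state, frag2.data(), frag2.size());
  const uint64_t streamed = XXH3_64bits_digest(state);
  XXH3_freeState(state);

  const std::string whole = frag1 + frag2;
  assert(streamed == XXH3_64bits(whole.data(), whole.size()));
}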
+ uint64_t LastRecordEnd(); + + // Returns true if the reader has encountered an EOF condition. + bool IsEOF() { return eof_; } + + // Returns true if the reader has encountered a read error. + bool hasReadError() const { return read_error_; } + + // When we know more data has been written to the file, we can use this + // function to force the reader to look again in the file. + // Also aligns the file position indicator to the start of the next block + // by reading the rest of the data from the EOF position to the end of the + // block that was partially read. + virtual void UnmarkEOF(); + + SequentialFileReader* file() { return file_.get(); } + + Reporter* GetReporter() const { return reporter_; } + + uint64_t GetLogNumber() const { return log_number_; } + + size_t GetReadOffset() const { + return static_cast<size_t>(end_of_buffer_offset_); + } + + bool IsCompressedAndEmptyFile() { + return !first_record_read_ && compression_type_record_read_; + } + + protected: + std::shared_ptr<Logger> info_log_; + const std::unique_ptr<SequentialFileReader> file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + + // Internal state variables used for reading records + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + bool read_error_; // Error occurred while reading from file + + // Offset of the file position indicator within the last block when an + // EOF was detected. + size_t eof_offset_; + + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. + uint64_t end_of_buffer_offset_; + + // which log number this is + uint64_t const log_number_; + + // Whether this is a recycled log file + bool recycled_; + + // Whether the first record has been read or not. + bool first_record_read_; + // Type of compression used + CompressionType compression_type_; + // Track whether the compression type record has been read or not. + bool compression_type_record_read_; + StreamingUncompress* uncompress_; + // Reusable uncompressed output buffer + std::unique_ptr<char[]> uncompressed_buffer_; + // Reusable uncompressed record + std::string uncompressed_record_; + // Used for stream hashing fragment content in ReadRecord() + XXH3_state_t* hash_state_; + // Used for stream hashing uncompressed buffer in ReadPhysicalRecord() + XXH3_state_t* uncompress_hash_state_; + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are two situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + kBadRecord = kMaxRecordType + 2, + // Returned when we fail to read a valid header. + kBadHeader = kMaxRecordType + 3, + // Returned when we read an old record from a previous user of the log. + kOldRecord = kMaxRecordType + 4, + // Returned when we get a bad record length + kBadRecordLen = kMaxRecordType + 5, + // Returned when we get a bad record checksum + kBadRecordChecksum = kMaxRecordType + 6, + }; + + // Return type, or one of the preceding special values + // If WAL compression is enabled, fragment_checksum is the checksum of the + // fragment computed from the original buffer containing the uncompressed + // fragment.
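+ // (Illustrative usage sketch, not part of the upstream documentation: a + // caller typically dispatches on the returned value, e.g. + // unsigned int t = ReadPhysicalRecord(&fragment, &drop_size); + // if (t <= kMaxRecordType) { /* genuine record type: consume fragment */ } + // else if (t == kEof) { /* stop reading */ } + // else { /* kBadRecord etc.: report corruption and resynchronize */ })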
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size, + uint64_t* fragment_checksum = nullptr); + + // Read some more + bool ReadMore(size_t* drop_size, int* error); + + void UnmarkEOFInternal(); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); + + void InitCompression(const CompressionTypeRecord& compression_record); +}; + +class FragmentBufferedReader : public Reader { + public: + FragmentBufferedReader(std::shared_ptr<Logger> info_log, + std::unique_ptr<SequentialFileReader>&& _file, + Reporter* reporter, bool checksum, uint64_t log_num) + : Reader(info_log, std::move(_file), reporter, checksum, log_num), + fragments_(), + in_fragmented_record_(false) {} + ~FragmentBufferedReader() override {} + bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords, + uint64_t* record_checksum = nullptr) override; + void UnmarkEOF() override; + + private: + std::string fragments_; + bool in_fragmented_record_; + + bool TryReadFragment(Slice* result, size_t* drop_size, + unsigned int* fragment_type_or_err); + + bool TryReadMore(size_t* drop_size, int* error); + + // No copy allowed + FragmentBufferedReader(const FragmentBufferedReader&); + void operator=(const FragmentBufferedReader&); +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc new file mode 100644 index 000000000..2a43dc152 --- /dev/null +++ b/src/rocksdb/db/log_test.cc @@ -0,0 +1,1062 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string.
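+// For example, BigString("abc", 7) yields "abcabca".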
+static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +// Param type is tuple<int, bool, CompressionType> +// get<0>(tuple): non-zero if recycling log, zero if regular log +// get<1>(tuple): true if allow retry after read EOF, false otherwise +class LogTest + : public ::testing::TestWithParam<std::tuple<int, bool, CompressionType>> { + private: + class StringSource : public FSSequentialFile { + public: + Slice& contents_; + bool force_error_; + size_t force_error_position_; + bool force_eof_; + size_t force_eof_position_; + bool returned_partial_; + bool fail_after_read_partial_; + explicit StringSource(Slice& contents, bool fail_after_read_partial) + : contents_(contents), + force_error_(false), + force_error_position_(0), + force_eof_(false), + force_eof_position_(0), + returned_partial_(false), + fail_after_read_partial_(fail_after_read_partial) {} + + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + if (fail_after_read_partial_) { + EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + } + + if (force_error_) { + if (force_error_position_ >= n) { + force_error_position_ -= n; + } else { + *result = Slice(contents_.data(), force_error_position_); + contents_.remove_prefix(force_error_position_); + force_error_ = false; + returned_partial_ = true; + return IOStatus::Corruption("read error"); + } + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + + if (force_eof_) { + if (force_eof_position_ >= n) { + force_eof_position_ -= n; + } else { + force_eof_ = false; + n = force_eof_position_; + returned_partial_ = true; + } + } + + // By using scratch we ensure that caller has control over the + // lifetime of result.data() + memcpy(scratch, contents_.data(), n); + *result = Slice(scratch, n); + + contents_.remove_prefix(n); + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (n > contents_.size()) { + contents_.clear(); + return IOStatus::NotFound("in-memory file skipped past end"); + } + + contents_.remove_prefix(n); + + return IOStatus::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) {} + void Corruption(size_t bytes, const Status& status) override { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + std::string& dest_contents() { return sink_->contents_; } + + const std::string& dest_contents() const { return sink_->contents_; } + + void reset_source_contents() { source_->contents_ = dest_contents(); } + + Slice reader_contents_; + test::StringSink* sink_; + StringSource* source_; + ReportCollector report_; + + protected: + std::unique_ptr<Writer> writer_; + std::unique_ptr<Reader> reader_; + bool allow_retry_read_; + CompressionType compression_type_; + + public: + LogTest() + : reader_contents_(), + sink_(new test::StringSink(&reader_contents_)), + source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))), + allow_retry_read_(std::get<1>(GetParam())), +
compression_type_(std::get<2>(GetParam())) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink_holder), "" /* don't care */, FileOptions())); + Writer* writer = + new Writer(std::move(file_writer), 123, std::get<0>(GetParam()), false, + compression_type_); + writer_.reset(writer); + std::unique_ptr source_holder(source_); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(source_holder), "" /* file name */)); + if (allow_retry_read_) { + reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader), + &report_, true /* checksum */, + 123 /* log_number */)); + } else { + reader_.reset(new Reader(nullptr, std::move(file_reader), &report_, + true /* checksum */, 123 /* log_number */)); + } + } + + Slice* get_reader_contents() { return &reader_contents_; } + + void Write(const std::string& msg) { + ASSERT_OK(writer_->AddRecord(Slice(msg))); + } + + size_t WrittenBytes() const { return dest_contents().size(); } + + std::string Read(const WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords) { + std::string scratch; + Slice record; + bool ret = false; + uint64_t record_checksum; + ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode, + &record_checksum); + if (ret) { + if (!allow_retry_read_) { + // allow_retry_read_ means using FragmentBufferedReader which does not + // support record checksum yet. + uint64_t actual_record_checksum = + XXH3_64bits(record.data(), record.size()); + assert(actual_record_checksum == record_checksum); + } + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, char delta) { + dest_contents()[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_contents()[offset] = new_byte; + } + + void ShrinkSize(int bytes) { sink_->Drop(bytes); } + + void FixChecksum(int header_offset, int len, bool recyclable) { + // Compute crc of type/len/data + int header_size = recyclable ? 
kRecyclableHeaderSize : kHeaderSize; + uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6], + header_size - 6 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_contents()[header_offset], crc); + } + + void ForceError(size_t position = 0) { + source_->force_error_ = true; + source_->force_error_position_ = position; + } + + size_t DroppedBytes() const { return report_.dropped_bytes_; } + + std::string ReportMessage() const { return report_.message_; } + + void ForceEOF(size_t position = 0) { + source_->force_eof_ = true; + source_->force_eof_position_ = position; + } + + void UnmarkEOF() { + source_->returned_partial_ = false; + reader_->UnmarkEOF(); + } + + bool IsEOF() { return reader_->IsEOF(); } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } +}; + +TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } + +TEST_P(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST_P(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, ShortTrailer) { + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, AlignedEof) { + int header_size = + std::get<0>(GetParam()) ? 
kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST_P(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST_P(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3, false); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + // Truncated last record is ignored, not treated as an error + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + // Truncated last record is ignored, not treated as an error + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: truncated header")); +} + +TEST_P(LogTest, BadLength) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + const int kPayloadSize = kBlockSize - header_size; + Write(BigString("bar", kPayloadSize)); + Write("foo"); + // Least significant size byte is stored in header[4]. + IncrementByte(4, 1); + if (!recyclable_log) { + ASSERT_EQ("foo", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); + } else { + ASSERT_EQ("EOF", Read()); + } +} + +TEST_P(LogTest, BadLengthAtEndIsIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. 
+ return; + } + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); +} + +TEST_P(LogTest, ChecksumMismatch) { + Write("foooooo"); + IncrementByte(0, 14); + ASSERT_EQ("EOF", Read()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + ASSERT_EQ(14U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); + } else { + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); + } +} + +TEST_P(LogTest, UnexpectedMiddleType) { + Write("foo"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, static_cast(recyclable_log ? kRecyclableMiddleType + : kMiddleType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST_P(LogTest, UnexpectedLastType) { + Write("foo"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, + static_cast(recyclable_log ? kRecyclableLastType : kLastType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST_P(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST_P(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST_P(LogTest, MissingLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. + ShrinkSize(14); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST_P(LogTest, MissingLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. + ShrinkSize(14); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data")); +} + +TEST_P(LogTest, PartialLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST_P(LogTest, PartialLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. 
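+ // (Note: ShrinkSize(1) drops the final payload byte, so the length field + // in the LAST fragment's header now advertises more bytes than are + // actually present.)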
+ ShrinkSize(1); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); +} + +TEST_P(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (unsigned int offset = kBlockSize; offset < 2 * kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + size_t dropped = DroppedBytes(); + ASSERT_LE(dropped, 2 * kBlockSize + 100); + ASSERT_GE(dropped, 2 * kBlockSize); + } else { + ASSERT_EQ("EOF", Read()); + } +} + +TEST_P(LogTest, ClearEofSingleBlock) { + Write("foo"); + Write("bar"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + ForceEOF(3 + header_size + 2); + ASSERT_EQ("foo", Read()); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_TRUE(IsEOF()); + ASSERT_EQ("EOF", Read()); + Write("xxx"); + UnmarkEOF(); + ASSERT_EQ("xxx", Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST_P(LogTest, ClearEofMultiBlock) { + size_t num_full_blocks = 5; + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + size_t n = (kBlockSize - header_size) * num_full_blocks + 25; + Write(BigString("foo", n)); + Write(BigString("bar", n)); + ForceEOF(n + num_full_blocks * header_size + header_size + 3); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_TRUE(IsEOF()); + UnmarkEOF(); + ASSERT_EQ(BigString("bar", n), Read()); + ASSERT_TRUE(IsEOF()); + Write(BigString("xxx", n)); + UnmarkEOF(); + ASSERT_EQ(BigString("xxx", n), Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST_P(LogTest, ClearEofError) { + // If an error occurs during Read() in UnmarkEOF(), the records contained + // in the buffer should be returned on subsequent calls of ReadRecord() + // until no more full records are left, whereafter ReadRecord() should return + // false to indicate that it cannot read any further. 
+ + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + ASSERT_TRUE(IsEOF()); + Write("xxx"); + ForceError(0); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, ClearEofError2) { + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + Write("xxx"); + ForceError(3); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST_P(LogTest, Recycle) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + return; // test is only valid for recycled logs + } + Write("foo"); + Write("bar"); + Write("baz"); + Write("bif"); + Write("blitz"); + while (get_reader_contents()->size() < log::kBlockSize * 2) { + Write("xxxxxxxxxxxxxxxx"); + } + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); + Writer recycle_writer(std::move(dest_holder), 123, true); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); + ASSERT_EQ("foooo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +// Do NOT enable compression for this instantiation. +INSTANTIATE_TEST_CASE_P( + Log, LogTest, + ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(), + ::testing::Values(CompressionType::kNoCompression))); + +class RetriableLogTest : public ::testing::TestWithParam { + private: + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) {} + void Corruption(size_t bytes, const Status& status) override { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + Slice contents_; + test::StringSink* sink_; + std::unique_ptr log_writer_; + Env* env_; + const std::string test_dir_; + const std::string log_file_; + std::unique_ptr writer_; + std::unique_ptr reader_; + ReportCollector report_; + std::unique_ptr log_reader_; + + public: + RetriableLogTest() + : contents_(), + sink_(new test::StringSink(&contents_)), + log_writer_(nullptr), + env_(Env::Default()), + test_dir_(test::PerThreadDBPath("retriable_log_test")), + log_file_(test_dir_ + "/log"), + writer_(nullptr), + reader_(nullptr), + log_reader_(nullptr) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr wfw(new WritableFileWriter( + std::move(sink_holder), "" /* file name */, FileOptions())); + log_writer_.reset(new Writer(std::move(wfw), 123, GetParam())); + } + + Status SetupTestEnv() { + Status s; + FileOptions fopts; + auto fs = env_->GetFileSystem(); + s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr); + std::unique_ptr writable_file; + if (s.ok()) { + s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr); + } + if (s.ok()) { + writer_.reset( + new WritableFileWriter(std::move(writable_file), log_file_, fopts)); + EXPECT_NE(writer_, nullptr); + } + std::unique_ptr seq_file; + if (s.ok()) { + s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr); + } + if (s.ok()) { + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + EXPECT_NE(reader_, nullptr); + log_reader_.reset(new FragmentBufferedReader( + nullptr, std::move(reader_), &report_, true /* checksum */, + 123 /* log_number */)); + EXPECT_NE(log_reader_, 
nullptr); + } + return s; + } + + std::string contents() { return sink_->contents_; } + + void Encode(const std::string& msg) { + ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + } + + void Write(const Slice& data) { + ASSERT_OK(writer_->Append(data)); + ASSERT_OK(writer_->Sync(true)); + } + + bool TryRead(std::string* result) { + assert(result != nullptr); + result->clear(); + std::string scratch; + Slice record; + bool r = log_reader_->ReadRecord(&record, &scratch); + if (r) { + result->assign(record.data(), record.size()); + return true; + } else { + return false; + } + } +}; + +TEST_P(RetriableLogTest, TailLog_PartialHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size - 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); + ASSERT_TRUE(eof); +} + +TEST_P(RetriableLogTest, TailLog_FullHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size + 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + ASSERT_TRUE(eof); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); +} + +TEST_P(RetriableLogTest, NonBlockingReadFullRecord) { + // Clear all sync point callbacks even if this test does not use sync point. + // It is necessary, otherwise the execute of this test may hit a sync point + // with which a callback is registered. The registered callback may access + // some dead variable, causing segfault. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(SetupTestEnv()); + size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + size_t delta = header_size - 1; + size_t old_sz = contents().size(); + Encode("foo-bar"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + std::string record; + ASSERT_FALSE(TryRead(&record)); + ASSERT_TRUE(record.empty()); + Write(Slice(part2)); + ASSERT_TRUE(TryRead(&record)); + ASSERT_EQ("foo-bar", record); +} + +INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); + +class CompressionLogTest : public LogTest { + public: + Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); } +}; + +TEST_P(CompressionLogTest, Empty) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + const bool compression_enabled = + std::get<2>(GetParam()) == kNoCompression ? false : true; + // If WAL compression is enabled, a record is added for the compression type + const int compression_record_size = compression_enabled ? 
kHeaderSize + 4 : 0; + ASSERT_EQ(compression_record_size, WrittenBytes()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(CompressionLogTest, ReadWrite) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST_P(CompressionLogTest, ManyBlocks) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST_P(CompressionLogTest, Fragmentation) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + Random rnd(301); + const std::vector wal_entries = { + "small", + rnd.RandomBinaryString(3 * kBlockSize / 2), // Spans into block 2 + rnd.RandomBinaryString(3 * kBlockSize), // Spans into block 5 + }; + for (const std::string& wal_entry : wal_entries) { + Write(wal_entry); + } + + for (const std::string& wal_entry : wal_entries) { + ASSERT_EQ(wal_entry, Read()); + } + ASSERT_EQ("EOF", Read()); +} + +INSTANTIATE_TEST_CASE_P( + Compression, CompressionLogTest, + ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(), + ::testing::Values(CompressionType::kNoCompression, + CompressionType::kZSTD))); + +class StreamingCompressionTest + : public ::testing::TestWithParam> {}; + +TEST_P(StreamingCompressionTest, Basic) { + size_t input_size = std::get<0>(GetParam()); + CompressionType compression_type = std::get<1>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + CompressionOptions opts; + constexpr uint32_t compression_format_version = 2; + StreamingCompress* compress = StreamingCompress::Create( + compression_type, opts, compression_format_version, kBlockSize); + StreamingUncompress* uncompress = StreamingUncompress::Create( + compression_type, compression_format_version, kBlockSize); + MemoryAllocator* allocator = new DefaultMemoryAllocator(); + std::string input_buffer = BigString("abc", input_size); + std::vector compressed_buffers; + size_t remaining; + // Call compress till the entire input is consumed + do { + char* output_buffer = (char*)allocator->Allocate(kBlockSize); + size_t output_pos; + remaining = compress->Compress(input_buffer.c_str(), input_size, + output_buffer, &output_pos); + if (output_pos > 0) { + std::string compressed_buffer; + compressed_buffer.assign(output_buffer, output_pos); + compressed_buffers.emplace_back(std::move(compressed_buffer)); + } + allocator->Deallocate((void*)output_buffer); + } while (remaining > 0); + std::string uncompressed_buffer = ""; + int ret_val = 0; + size_t output_pos; + char* uncompressed_output_buffer = (char*)allocator->Allocate(kBlockSize); + // Uncompress the 
fragments and concatenate them. + for (int i = 0; i < (int)compressed_buffers.size(); i++) { + // Call uncompress till either the entire input is consumed or the output + // buffer size is equal to the allocated output buffer size. + do { + ret_val = uncompress->Uncompress(compressed_buffers[i].c_str(), + compressed_buffers[i].size(), + uncompressed_output_buffer, &output_pos); + if (output_pos > 0) { + std::string uncompressed_fragment; + uncompressed_fragment.assign(uncompressed_output_buffer, output_pos); + uncompressed_buffer += uncompressed_fragment; + } + } while (ret_val > 0 || output_pos == kBlockSize); + } + allocator->Deallocate((void*)uncompressed_output_buffer); + delete allocator; + delete compress; + delete uncompress; + // The final return value from uncompress() should be 0. + ASSERT_EQ(ret_val, 0); + ASSERT_EQ(input_buffer, uncompressed_buffer); +} + +INSTANTIATE_TEST_CASE_P( + StreamingCompression, StreamingCompressionTest, + ::testing::Combine(::testing::Values(10, 100, 1000, kBlockSize, + kBlockSize * 2), + ::testing::Values(CompressionType::kZSTD))); + +} // namespace log +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc new file mode 100644 index 000000000..56f58543e --- /dev/null +++ b/src/rocksdb/db/log_writer.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include <stdint.h> + +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { +namespace log { + +Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number, + bool recycle_log_files, bool manual_flush, + CompressionType compression_type) + : dest_(std::move(dest)), + block_offset_(0), + log_number_(log_number), + recycle_log_files_(recycle_log_files), + manual_flush_(manual_flush), + compression_type_(compression_type), + compress_(nullptr) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast<char>(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { + if (dest_) { + WriteBuffer().PermitUncheckedError(); + } + if (compress_) { + delete compress_; + } +} + +IOStatus Writer::WriteBuffer() { + if (dest_->seen_error()) { + return IOStatus::IOError("Seen error. Skip writing buffer."); + } + return dest_->Flush(); +} + +IOStatus Writer::Close() { + IOStatus s; + if (dest_) { + s = dest_->Close(); + dest_.reset(); + } + return s; +} + +IOStatus Writer::AddRecord(const Slice& slice, + Env::IOPriority rate_limiter_priority) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Header size varies depending on whether we are recycling or not. + const int header_size = + recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize; + + // Fragment the record if necessary and emit it.
Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + IOStatus s; + bool begin = true; + int compress_remaining = 0; + bool compress_start = false; + if (compress_) { + compress_->Reset(); + compress_start = true; + } + do { + const int64_t leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < header_size) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover)), + 0 /* crc32c_checksum */, rate_limiter_priority); + if (!s.ok()) { + break; + } + } + block_offset_ = 0; + } + + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); + + const size_t avail = kBlockSize - block_offset_ - header_size; + + // Compress the record if compression is enabled. + // Compress() is called at least once (compress_start=true) and after the + // previous generated compressed chunk is written out as one or more + // physical records (left=0). + if (compress_ && (compress_start || left == 0)) { + compress_remaining = compress_->Compress(slice.data(), slice.size(), + compressed_buffer_.get(), &left); + + if (compress_remaining < 0) { + // Set failure status + s = IOStatus::IOError("Unexpected WAL compression error"); + s.SetDataLoss(true); + break; + } else if (left == 0) { + // Nothing left to compress + if (!compress_start) { + break; + } + } + compress_start = false; + ptr = compressed_buffer_.get(); + } + + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length && compress_remaining == 0); + if (begin && end) { + type = recycle_log_files_ ? kRecyclableFullType : kFullType; + } else if (begin) { + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; + } else if (end) { + type = recycle_log_files_ ? kRecyclableLastType : kLastType; + } else { + type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && (left > 0 || compress_remaining > 0)); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(rate_limiter_priority); + } + } + + return s; +} + +IOStatus Writer::AddCompressionTypeRecord() { + // Should be the first record + assert(block_offset_ == 0); + + if (compression_type_ == kNoCompression) { + // No need to add a record + return IOStatus::OK(); + } + + CompressionTypeRecord record(compression_type_); + std::string encode; + record.EncodeTo(&encode); + IOStatus s = + EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size()); + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + // Initialize fields required for compression + const size_t max_output_buffer_len = + kBlockSize - (recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize); + CompressionOptions opts; + constexpr uint32_t compression_format_version = 2; + compress_ = StreamingCompress::Create(compression_type_, opts, + compression_format_version, + max_output_buffer_len); + assert(compress_ != nullptr); + compressed_buffer_ = + std::unique_ptr(new char[max_output_buffer_len]); + assert(compressed_buffer_); + } else { + // Disable compression if the record could not be added. 
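+ // (Rationale sketch: if the compression-type record could not be + // persisted, readers would have no way to tell that the stream is + // compressed, so falling back to kNoCompression keeps the log + // self-describing.)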
+ compression_type_ = kNoCompression; + } + return s; +} + +bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); } + +IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, + Env::IOPriority rate_limiter_priority) { + assert(n <= 0xffff); // Must fit in two bytes + + size_t header_size; + char buf[kRecyclableHeaderSize]; + + // Format the header + buf[4] = static_cast<char>(n & 0xff); + buf[5] = static_cast<char>(n >> 8); + buf[6] = static_cast<char>(t); + + uint32_t crc = type_crc_[t]; + if (t < kRecyclableFullType || t == kSetCompressionType) { + // Legacy record format + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + header_size = kHeaderSize; + } else { + // Recyclable record format + assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize); + header_size = kRecyclableHeaderSize; + + // Only encode low 32-bits of the 64-bit log number. This means + // we will fail to detect an old record if we recycled a log from + // ~4 billion logs ago, but that is effectively impossible, and + // even if it were we'd be far more likely to see a false positive + // on the 32-bit CRC. + EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_)); + crc = crc32c::Extend(crc, buf + 7, 4); + } + + // Compute the crc of the record type and the payload. + uint32_t payload_crc = crc32c::Value(ptr, n); + crc = crc32c::Crc32cCombine(crc, payload_crc, n); + crc = crc32c::Mask(crc); // Adjust for storage + TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", + &crc); + EncodeFixed32(buf, crc); + + // Write the header and the payload + IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */, + rate_limiter_priority); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority); + } + block_offset_ += header_size + n; + return s; +} + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h new file mode 100644 index 000000000..5d266e434 --- /dev/null +++ b/src/rocksdb/db/log_writer.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <stdint.h> +#include <memory> + +#include "db/log_format.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; + +namespace log { + +/** + * Writer is a general purpose log stream writer. It provides an append-only + * abstraction for writing data. The details of how the data is written are + * handled by the WritableFile sub-class implementation. + * + * File format: + * + * File is broken down into variable sized records. The format of each record + * is described below. + * +-----+-------------+--+----+----------+------+-- ... ----+ + * File | r0 | r1 |P | r2 | r3 | r4 | | + * +-----+-------------+--+----+----------+------+-- ... ----+ + * <--- kBlockSize ------>|<-- kBlockSize ------>| + * rn = variable size records + * P = Padding + * + * Data is written out in kBlockSize chunks.
If the next record does not fit + * into the space left, the leftover space will be padded with \0. + * + * Legacy record format: + * + * +---------+-----------+-----------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Payload | + * +---------+-----------+-----------+--- ... ---+ + * + * CRC = 32bit hash computed over the record type and payload using CRC + * Size = Length of the payload data + * Type = Type of record + * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType ) + * The type is used to group a bunch of records together to represent + * blocks that are larger than kBlockSize + * Payload = Byte stream as long as specified by the payload size + * + * Recyclable record format: + * + * +---------+-----------+-----------+----------------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | + * +---------+-----------+-----------+----------------+--- ... ---+ + * + * Same as above, with the addition of + * Log number = 32bit log file number, so that we can distinguish between + * records written by the most recent log writer vs a previous one. + */ +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(std::unique_ptr<WritableFileWriter>&& dest, + uint64_t log_number, bool recycle_log_files, + bool manual_flush = false, + CompressionType compressionType = kNoCompression); + // No copying allowed + Writer(const Writer&) = delete; + void operator=(const Writer&) = delete; + + ~Writer(); + + IOStatus AddRecord(const Slice& slice, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + IOStatus AddCompressionTypeRecord(); + + WritableFileWriter* file() { return dest_.get(); } + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + IOStatus WriteBuffer(); + + IOStatus Close(); + + bool BufferIsEmpty(); + + private: + std::unique_ptr<WritableFileWriter> dest_; + size_t block_offset_; // Current offset in block + uint64_t log_number_; + bool recycle_log_files_; + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + IOStatus EmitPhysicalRecord( + RecordType type, const char* ptr, size_t length, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + + // If true, it does not flush after each write. Instead it relies on the + // upper layer to do the flush manually by calling ::WriteBuffer(). + bool manual_flush_; + + // Compression Type + CompressionType compression_type_; + StreamingCompress* compress_; + // Reusable compressed output buffer + std::unique_ptr<char[]> compressed_buffer_; +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc new file mode 100644 index 000000000..ff98155c4 --- /dev/null +++ b/src/rocksdb/db/logs_with_prep_tracker.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+// +#include "db/logs_with_prep_tracker.h" + +#include "port/likely.h" + +namespace ROCKSDB_NAMESPACE { +void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_); + auto it = prepared_section_completed_.find(log); + if (UNLIKELY(it == prepared_section_completed_.end())) { + prepared_section_completed_[log] = 1; + } else { + it->second += 1; + } +} + +void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard<std::mutex> lock(logs_with_prep_mutex_); + + auto rit = logs_with_prep_.rbegin(); + bool updated = false; + // Most probably the last log is the one that is being marked for + // having a prepare section; so search from the end. + for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) { + if (rit->log == log) { + rit->cnt++; + updated = true; + break; + } + } + if (!updated) { + // We are either at the start, or at a position with rit->log < log + logs_with_prep_.insert(rit.base(), {log, 1}); + } +} + +uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() { + std::lock_guard<std::mutex> lock(logs_with_prep_mutex_); + auto it = logs_with_prep_.begin(); + // start with the smallest log + for (; it != logs_with_prep_.end();) { + auto min_log = it->log; + { + std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_); + auto completed_it = prepared_section_completed_.find(min_log); + if (completed_it == prepared_section_completed_.end() || + completed_it->second < it->cnt) { + return min_log; + } + assert(completed_it != prepared_section_completed_.end() && + completed_it->second == it->cnt); + prepared_section_completed_.erase(completed_it); + } + // erase from beginning in vector is not efficient but this function is not + // on the fast path. + it = logs_with_prep_.erase(it); + } + // no such log found + return 0; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h new file mode 100644 index 000000000..f72f0ca07 --- /dev/null +++ b/src/rocksdb/db/logs_with_prep_tracker.h @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include <stdint.h> +#include <cassert> +#include <cstdlib> +#include <mutex> +#include <unordered_map> +#include <vector> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This class is used to track the log files with outstanding prepare entries. +class LogsWithPrepTracker { + public: + // Called when a transaction prepared in `log` has been committed or aborted. + void MarkLogAsHavingPrepSectionFlushed(uint64_t log); + // Called when a transaction is prepared in `log`. + void MarkLogAsContainingPrepSection(uint64_t log); + // Return the earliest log file with outstanding prepare entries. + uint64_t FindMinLogContainingOutstandingPrep(); + size_t TEST_PreparedSectionCompletedSize() { + return prepared_section_completed_.size(); + } + size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); } + + private: + // REQUIRES: logs_with_prep_mutex_ held + // + // sorted list of log numbers still containing prepared data.
+ // This is used by FindObsoleteFiles to determine which + // flushed logs we must keep around because they still + // contain prepared data which has not been committed or rolled back. + struct LogCnt { + uint64_t log; // the log number + uint64_t cnt; // number of prepared sections in the log + }; + std::vector<LogCnt> logs_with_prep_; + std::mutex logs_with_prep_mutex_; + + // REQUIRES: prepared_section_completed_mutex_ held + // + // to be used in conjunction with logs_with_prep_. + // Once a transaction with data in log L is committed or rolled back, + // rather than updating logs_with_prep_ directly, we keep track of that + // in prepared_section_completed_, which maps LOG -> instance_count. This + // helps avoid contention between a commit thread and the prepare threads. + // + // When trying to determine the minimum log still active, we first + // consult logs_with_prep_. While that root value maps to + // an equal value in prepared_section_completed_, we erase the log from + // both logs_with_prep_ and prepared_section_completed_. + std::unordered_map<uint64_t, uint64_t> prepared_section_completed_; + std::mutex prepared_section_completed_mutex_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h new file mode 100644 index 000000000..68851bddd --- /dev/null +++ b/src/rocksdb/db/lookup_key.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <string> +#include <utility> + +#include "rocksdb/slice.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& _user_key, SequenceNumber sequence, + const Slice* ts = nullptr); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { + return Slice(start_, static_cast<size_t>(end_ - start_)); + } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { + return Slice(kstart_, static_cast<size_t>(end_ - kstart_)); + } + + // Return the user key. + // If user-defined timestamp is enabled, then timestamp is included in the + // result. + Slice user_key() const { + return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8)); + } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey.
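+ // (Worked example, assuming no user-defined timestamp: for user key "abc" + // at sequence 42, klength is 11 (3 key bytes + 8 tag bytes) encoded as a + // varint32, followed by the bytes "abc", followed by an 8-byte tag that + // packs (42 << 8) | kValueTypeForSeek.)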
+ const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc new file mode 100644 index 000000000..52f2e6e0f --- /dev/null +++ b/src/rocksdb/db/malloc_stats.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/malloc_stats.h" + +#ifndef ROCKSDB_LITE +#include <string.h> + +#include <memory> + +#include "port/jemalloc_helper.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_JEMALLOC + +struct MallocStatus { + char* cur; + char* end; +}; + +static void GetJemallocStatus(void* mstat_arg, const char* status) { + MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg); + size_t status_len = status ? strlen(status) : 0; + size_t buf_size = (size_t)(mstat->end - mstat->cur); + if (!status_len || status_len > buf_size) { + return; + } + + snprintf(mstat->cur, buf_size, "%s", status); + mstat->cur += status_len; +} +void DumpMallocStats(std::string* stats) { + if (!HasJemalloc()) { + return; + } + MallocStatus mstat; + const unsigned int kMallocStatusLen = 1000000; + std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]}; + mstat.cur = buf.get(); + mstat.end = buf.get() + kMallocStatusLen; + malloc_stats_print(GetJemallocStatus, &mstat, ""); + stats->append(buf.get()); +} +#else +void DumpMallocStats(std::string*) {} +#endif // ROCKSDB_JEMALLOC +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h new file mode 100644 index 000000000..18aff3ad0 --- /dev/null +++ b/src/rocksdb/db/malloc_stats.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +void DumpMallocStats(std::string*); + +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc new file mode 100644 index 000000000..b92cb794b --- /dev/null +++ b/src/rocksdb/db/manual_compaction_test.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Test for issue 178: a manual compaction causes deleted data to reappear.
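+// (Scenario sketch: the tests below write one key range, write and then +// delete a second range, and manually compact the first; the deleted range +// must not reappear, and the surviving range must keep all of its keys.)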
+#include <iostream>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionStyle;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DestroyDB;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written
+// to the batch in one write, each write will result in one SST file. We
+// reduced the write_buffer_size to 1K to get basically the same effect with
+// far fewer keys, which shortens the test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "my_key_%d", i);
+  return buf;
+}
+
+std::string Key2(int i) { return Key1(i) + "_xxx"; }
+
+class ManualCompactionTest : public testing::Test {
+ public:
+  ManualCompactionTest() {
+    // Get rid of any state from an old run.
+    dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath(
+        "rocksdb_manual_compaction_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+  DestroyAllCompactionFilter() {}
+
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return existing_value.ToString() == "destroy";
+  }
+
+  const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+class LogCompactionFilter : public CompactionFilter {
+ public:
+  const char* Name() const override { return "LogCompactionFilter"; }
+
+  bool Filter(int level, const Slice& key, const Slice& /*existing_value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    key_level_[key.ToString()] = level;
+    return false;
+  }
+
+  void Reset() { key_level_.clear(); }
+
+  size_t NumKeys() const { return key_level_.size(); }
+
+  int KeyLevel(const Slice& key) {
+    auto it = key_level_.find(key.ToString());
+    if (it == key_level_.end()) {
+      return -1;
+    }
+    return it->second;
+  }
+
+ private:
+  mutable std::map<std::string, int> key_level_;
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+  for (int iter = 0; iter < 2; ++iter) {
+    DB* db;
+    Options options;
+    if (iter == 0) {  // level compaction
+      options.num_levels = 3;
+      options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+    } else {  // universal compaction
+      options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+    }
+    options.create_if_missing = true;
+    options.compression = CompressionType::kNoCompression;
+    options.compaction_filter = new DestroyAllCompactionFilter();
+    ASSERT_OK(DB::Open(options, dbname_, &db));
+
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy")));
+
+    Slice key4("key4");
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(),
nullptr, &key4)); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + ASSERT_OK(DestroyDB(dbname_, options)); + } +} + +TEST_F(ManualCompactionTest, Test) { + // Open database. Disable compression since it affects the creation + // of layers and the code below is trying to test against a very + // specific scenario. + DB* db; + Options db_options; + db_options.write_buffer_size = 1024; + db_options.create_if_missing = true; + db_options.compression = CompressionType::kNoCompression; + ASSERT_OK(DB::Open(db_options, dbname_, &db)); + + // create first key range + WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(Key1(i), "value for range 1 key")); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(Key2(i), "value for range 2 key")); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Delete(Key2(i))); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + + // compact database + std::string start_key = Key1(0); + std::string end_key = Key1(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest)); + + // count the keys + Iterator* iter = db->NewIterator(ReadOptions()); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + delete iter; + ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; + + // close database + delete db; + ASSERT_OK(DestroyDB(dbname_, Options())); +} + +TEST_F(ManualCompactionTest, SkipLevel) { + DB* db; + Options options; + options.num_levels = 3; + // Initially, flushed L0 files won't exceed 100. 
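+  // A high trigger keeps automatic compaction from kicking in, so flushed
+  // files stay in L0 until the test calls CompactRange() itself and can
+  // assert exactly which keys each manual compaction touches.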
+ options.level0_file_num_compaction_trigger = 100; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.create_if_missing = true; + options.compression = CompressionType::kNoCompression; + LogCompactionFilter* filter = new LogCompactionFilter(); + options.compaction_filter = filter; + ASSERT_OK(DB::Open(options, dbname_, &db)); + + WriteOptions wo; + FlushOptions fo; + ASSERT_OK(db->Put(wo, "1", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "2", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "4", "")); + ASSERT_OK(db->Put(wo, "8", "")); + ASSERT_OK(db->Flush(fo)); + + { + // L0: 1, 2, [4, 8] + // no file has keys in range [5, 7] + Slice start("5"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2, [4, 8] + // [3, 7] overlaps with 4 in L0 + Slice start("3"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(2, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("4")); + ASSERT_EQ(0, filter->KeyLevel("8")); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range (-inf, 0] + Slice end("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range [9, inf) + Slice start("9"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // [2, 2] overlaps with 2 in L0 + Slice start("2"); + Slice end("2"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(1, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("2")); + } + + { + // L0: 1 + // L1: 2, [4, 8] + // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0 + Slice start("2"); + Slice end("5"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(3, filter->NumKeys()); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + { + // L0: 1 + // L1: [2, 4, 8] + // [0, inf) overlaps all files + Slice start("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(4, filter->NumKeys()); + // 1 is first compacted to L1 and then further compacted into [2, 4, 8], + // so finally the logged level for 1 is L1. + ASSERT_EQ(1, filter->KeyLevel("1")); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + delete filter; + delete db; + ASSERT_OK(DestroyDB(dbname_, options)); +} + +} // anonymous namespace + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc new file mode 100644 index 000000000..45b139e80 --- /dev/null +++ b/src/rocksdb/db/memtable.cc @@ -0,0 +1,1675 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/memtable.h" + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/kv_checksum.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/read_callback.h" +#include "db/wide/wide_column_serialization.h" +#include "logging/logging.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/types.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "table/merging_iterator.h" +#include "util/autovector.h" +#include "util/coding.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +ImmutableMemTableOptions::ImmutableMemTableOptions( + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options) + : arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits( + static_cast( + static_cast(mutable_cf_options.write_buffer_size) * + mutable_cf_options.memtable_prefix_bloom_size_ratio) * + 8u), + memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size), + memtable_whole_key_filtering( + mutable_cf_options.memtable_whole_key_filtering), + inplace_update_support(ioptions.inplace_update_support), + inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), + inplace_callback(ioptions.inplace_callback), + max_successive_merges(mutable_cf_options.max_successive_merges), + statistics(ioptions.stats), + merge_operator(ioptions.merge_operator.get()), + info_log(ioptions.logger), + allow_data_in_errors(ioptions.allow_data_in_errors), + protection_bytes_per_key( + mutable_cf_options.memtable_protection_bytes_per_key) {} + +MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + WriteBufferManager* write_buffer_manager, + SequenceNumber latest_seq, uint32_t column_family_id) + : comparator_(cmp), + moptions_(ioptions, mutable_cf_options), + refs_(0), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + mem_tracker_(write_buffer_manager), + arena_(moptions_.arena_block_size, + (write_buffer_manager != nullptr && + (write_buffer_manager->enabled() || + write_buffer_manager->cost_to_cache())) + ? 
&mem_tracker_
+                 : nullptr,
+             mutable_cf_options.memtable_huge_page_size),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+          ioptions.logger, column_family_id)),
+      range_del_table_(SkipListFactory().CreateMemTableRep(
+          comparator_, &arena_, nullptr /* transform */, ioptions.logger,
+          column_family_id)),
+      is_range_del_table_empty_(true),
+      data_size_(0),
+      num_entries_(0),
+      num_deletes_(0),
+      write_buffer_size_(mutable_cf_options.write_buffer_size),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      earliest_seqno_(latest_seq),
+      creation_seq_(latest_seq),
+      mem_next_logfile_number_(0),
+      min_prep_log_referenced_(0),
+      locks_(moptions_.inplace_update_support
+                 ? moptions_.inplace_update_num_locks
+                 : 0),
+      prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      flush_state_(FLUSH_NOT_REQUESTED),
+      clock_(ioptions.clock),
+      insert_with_hint_prefix_extractor_(
+          ioptions.memtable_insert_with_hint_prefix_extractor.get()),
+      oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+      atomic_flush_seqno_(kMaxSequenceNumber),
+      approximate_memory_usage_(0) {
+  UpdateFlushState();
+  // something went wrong if we need to flush before inserting anything
+  assert(!ShouldScheduleFlush());
+
+  // use bloom_filter_ for both whole key and prefix bloom filter
+  if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+      moptions_.memtable_prefix_bloom_bits > 0) {
+    bloom_filter_.reset(
+        new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+                         6 /* hard coded 6 probes */,
+                         moptions_.memtable_huge_page_size, ioptions.logger));
+  }
+  // Initialize cached_range_tombstone_ here since it could
+  // be read before it is constructed in MemTable::Add(), which could also lead
+  // to a data race on the global mutex table backing atomic shared_ptr.
+  auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+  size_t size = cached_range_tombstone_.Size();
+  for (size_t i = 0; i < size; ++i) {
+    std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+        cached_range_tombstone_.AccessAtCore(i);
+    auto new_local_cache_ref = std::make_shared<
+        const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+    std::atomic_store_explicit(
+        local_cache_ref_ptr,
+        std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
+                                                           new_cache.get()),
+        std::memory_order_relaxed);
+  }
+}
+
+MemTable::~MemTable() {
+  mem_tracker_.FreeMem();
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+  autovector<size_t> usages = {
+      arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+      range_del_table_->ApproximateMemoryUsage(),
+      ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+  size_t total_usage = 0;
+  for (size_t usage : usages) {
+    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
+    // The following variation is to avoid numeric overflow.
+    if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
+      return std::numeric_limits<size_t>::max();
+    }
+    total_usage += usage;
+  }
+  approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+  // otherwise, return the actual usage
+  return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  // Often we cannot allocate arena blocks that exactly match the buffer size.
+  // Thus we have to decide if we should over-allocate or under-allocate.
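+  // For example, with write_buffer_size = 1 MB and kArenaBlockSize = 128 KB,
+  // the first check below keeps filling while allocated_memory + 128 KB <
+  // 1 MB + 0.6 * 128 KB, the second check forces a flush once
+  // allocated_memory > 1 MB + 0.6 * 128 KB, and anything in between falls
+  // through to the "last block is 3/4 full" heuristic explained below.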
+  // This constant can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to
+  // over-allocate one more block.
+  const double kAllowOverAllocationRatio = 0.6;
+
+  // If the arena still has room for a new block allocation, we can safely say
+  // the memtable shouldn't be flushed yet.
+  auto allocated_memory = table_->ApproximateMemoryUsage() +
+                          range_del_table_->ApproximateMemoryUsage() +
+                          arena_.MemoryAllocatedBytes();
+
+  approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+  // if we can still allocate one more block without exceeding the
+  // over-allocation ratio, then we should not flush.
+  if (allocated_memory + kArenaBlockSize <
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return false;
+  }
+
+  // if the user keeps adding entries whose total size exceeds
+  // write_buffer_size, we need to flush earlier even though we still have
+  // much available memory left.
+  if (allocated_memory >
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return true;
+  }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocated memory size is either:
+  //  (1) "moderately" over-allocated (by no more than `0.6 * arena block
+  //  size`). Or,
+  //  (2) less than the write buffer size, but we'll stop here anyway: if we
+  //  allocated a new arena block, we'd over-allocate by too much more (half
+  //  of the arena block size).
+  //
+  // In either case, to avoid over-allocation, the last block will stop
+  // allocation when its usage reaches a certain ratio. We carefully choose
+  // "0.75 full" as the stop condition because it addresses the following
+  // issue with great simplicity: What if the next inserted entry's size is
+  // bigger than AllocatedAndUnused()?
+  //
+  // The answer is: if the entry size is also bigger than 0.25 *
+  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+  // and regular block. In either case, we *overly* over-allocated.
+  //
+  // Therefore, setting the last block to be at most "0.75 full" avoids both
+  // cases.
+  //
+  // NOTE: the average percentage of wasted space with this approach can be
+  // counted as: "arena block size * 0.25 / write buffer size". Users who
+  // specify a small write buffer size and/or a big arena block size may
+  // suffer.
+  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+void MemTable::UpdateFlushState() {
+  auto state = flush_state_.load(std::memory_order_relaxed);
+  if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+    // ignore CAS failure, because that means somebody else requested
+    // a flush
+    flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+                                         std::memory_order_relaxed,
+                                         std::memory_order_relaxed);
+  }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+  uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+  if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+    int64_t current_time = 0;
+    auto s = clock_->GetCurrentTime(&current_time);
+    if (s.ok()) {
+      assert(current_time >= 0);
+      // If the CAS below fails, the timestamp was already set.
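+      // (On failure, compare_exchange_strong loads the value stored by the
+      // other thread into oldest_key_time, so no retry loop is needed.)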
+ oldest_key_time_.compare_exchange_strong( + oldest_key_time, static_cast(current_time), + std::memory_order_relaxed, std::memory_order_relaxed); + } + } +} + +Status MemTable::VerifyEntryChecksum(const char* entry, + size_t protection_bytes_per_key, + bool allow_data_in_errors) { + if (protection_bytes_per_key == 0) { + return Status::OK(); + } + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (key_ptr == nullptr) { + return Status::Corruption("Unable to parse internal key length"); + } + if (key_length < 8) { + return Status::Corruption("Memtable entry internal key length too short."); + } + Slice user_key = Slice(key_ptr, key_length - 8); + + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber seq; + UnPackSequenceAndType(tag, &seq, &type); + + uint32_t value_length = 0; + const char* value_ptr = GetVarint32Ptr( + key_ptr + key_length, key_ptr + key_length + 5, &value_length); + if (value_ptr == nullptr) { + return Status::Corruption("Unable to parse internal key value"); + } + Slice value = Slice(value_ptr, value_length); + + const char* checksum_ptr = value_ptr + value_length; + uint64_t expected = ProtectionInfo64() + .ProtectKVO(user_key, value, type) + .ProtectS(seq) + .GetVal(); + bool match = true; + switch (protection_bytes_per_key) { + case 1: + match = static_cast(checksum_ptr[0]) == + static_cast(expected); + break; + case 2: + match = DecodeFixed16(checksum_ptr) == static_cast(expected); + break; + case 4: + match = DecodeFixed32(checksum_ptr) == static_cast(expected); + break; + case 8: + match = DecodeFixed64(checksum_ptr) == expected; + break; + default: + assert(false); + } + if (!match) { + std::string msg( + "Corrupted memtable entry, per key-value checksum verification " + "failed."); + if (allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". "); + msg.append("seq: " + std::to_string(seq) + "."); + } + return Status::Corruption(msg.c_str()); + } + return Status::OK(); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.CompareKeySeq(k1, k2); +} + +int MemTable::KeyComparator::operator()( + const char* prefix_len_key, const KeyComparator::DecodedType& key) const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.CompareKeySeq(a, key); +} + +void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("concurrent insert not supported"); +#else + abort(); +#endif +} + +Slice MemTableRep::UserKey(const char* key) const { + Slice slice = GetLengthPrefixedSlice(key); + return Slice(slice.data(), slice.size() - 8); +} + +KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { + *buf = allocator_->Allocate(len); + return static_cast(*buf); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. 
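+// For example, a target of "abc" is encoded as "\x03abc": a varint32 length
+// prefix followed by the key bytes, matching the length-prefixed keys stored
+// in the memtable.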
+const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, static_cast(target.size())); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator : public InternalIterator { + public: + MemTableIterator(const MemTable& mem, const ReadOptions& read_options, + Arena* arena, bool use_range_del_table = false) + : bloom_(nullptr), + prefix_extractor_(mem.prefix_extractor_), + comparator_(mem.comparator_), + valid_(false), + arena_mode_(arena != nullptr), + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support), + protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), + status_(Status::OK()), + logger_(mem.moptions_.info_log) { + if (use_range_del_table) { + iter_ = mem.range_del_table_->GetIterator(arena); + } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && + !read_options.auto_prefix_mode) { + // Auto prefix mode is not implemented in memtable yet. + bloom_ = mem.bloom_filter_.get(); + iter_ = mem.table_->GetDynamicPrefixIterator(arena); + } else { + iter_ = mem.table_->GetIterator(arena); + } + status_.PermitUncheckedError(); + } + // No copying allowed + MemTableIterator(const MemTableIterator&) = delete; + void operator=(const MemTableIterator&) = delete; + + ~MemTableIterator() override { +#ifndef NDEBUG + // Assert that the MemTableIterator is never deleted while + // Pinning is Enabled. + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); +#endif + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } + +#ifndef NDEBUG + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool Valid() const override { return valid_ && status_.ok(); } + void Seek(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + // iterator should only use prefix bloom filter + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts)) { + if (!bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void SeekForPrev(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts)) { + if (!bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + if (!Valid() && status().ok()) { + SeekToLast(); + } + while (Valid() && comparator_.comparator.Compare(k, key()) < 0) { + Prev(); + } + } + void SeekToFirst() override { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void SeekToLast() override { + 
iter_->SeekToLast(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void Next() override { + PERF_COUNTER_ADD(next_on_memtable_count, 1); + assert(Valid()); + iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } + return is_valid; + } + void Prev() override { + PERF_COUNTER_ADD(prev_on_memtable_count, 1); + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + Slice key() const override { + assert(Valid()); + return GetLengthPrefixedSlice(iter_->key()); + } + Slice value() const override { + assert(Valid()); + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + Status status() const override { return status_; } + + bool IsKeyPinned() const override { + // memtable data is always pinned + return true; + } + + bool IsValuePinned() const override { + // memtable value is always pinned, except if we allow inplace update. + return value_pinned_; + } + + private: + DynamicBloom* bloom_; + const SliceTransform* const prefix_extractor_; + const MemTable::KeyComparator comparator_; + MemTableRep::Iterator* iter_; + bool valid_; + bool arena_mode_; + bool value_pinned_; + size_t protection_bytes_per_key_; + Status status_; + Logger* logger_; + + void VerifyEntryChecksum() { + if (protection_bytes_per_key_ > 0 && Valid()) { + status_ = MemTable::VerifyEntryChecksum(iter_->key(), + protection_bytes_per_key_); + if (!status_.ok()) { + ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState()); + } + } + } +}; + +InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, + Arena* arena) { + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, read_options, arena); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable) { + if (read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed)) { + return nullptr; + } + return NewRangeTombstoneIteratorInternal(read_options, read_seq, + immutable_memtable); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable) { + if (immutable_memtable) { + // Note that caller should already have verified that + // !is_range_del_table_empty_ + assert(IsFragmentedRangeTombstonesConstructed()); + return new FragmentedRangeTombstoneIterator( + fragmented_range_tombstone_list_.get(), comparator_.comparator, + read_seq, read_options.timestamp); + } + + // takes current cache + std::shared_ptr cache = + std::atomic_load_explicit(cached_range_tombstone_.Access(), + std::memory_order_relaxed); + // construct fragmented tombstone list if necessary + if (!cache->initialized.load(std::memory_order_acquire)) { + cache->reader_mutex.lock(); + if (!cache->tombstones) { + auto* unfragmented_iter = + new MemTableIterator(*this, read_options, nullptr /* arena */, + true /* use_range_del_table */); + cache->tombstones.reset(new FragmentedRangeTombstoneList( + std::unique_ptr(unfragmented_iter), + 
comparator_.comparator)); + cache->initialized.store(true, std::memory_order_release); + } + cache->reader_mutex.unlock(); + } + + auto* fragmented_iter = new FragmentedRangeTombstoneIterator( + cache, comparator_.comparator, read_seq, read_options.timestamp); + return fragmented_iter; +} + +void MemTable::ConstructFragmentedRangeTombstones() { + assert(!IsFragmentedRangeTombstonesConstructed(false)); + // There should be no concurrent Construction + if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + auto* unfragmented_iter = + new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, + true /* use_range_del_table */); + + fragmented_range_tombstone_list_ = + std::make_unique( + std::unique_ptr(unfragmented_iter), + comparator_.comparator); + } +} + +port::RWMutex* MemTable::GetLock(const Slice& key) { + return &locks_[GetSliceRangedNPHash(key, locks_.size())]; +} + +MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey) { + uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey); + entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey); + if (entry_count == 0) { + return {0, 0}; + } + uint64_t n = num_entries_.load(std::memory_order_relaxed); + if (n == 0) { + return {0, 0}; + } + if (entry_count > n) { + // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can + // be larger than actual entries we have. Cap it to entries we have to limit + // the inaccuracy. + entry_count = n; + } + uint64_t data_size = data_size_.load(std::memory_order_relaxed); + return {entry_count * (data_size / n), entry_count}; +} + +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t user_key_len = ikey_len - 8; + Slice key(encoded.data(), user_key_len); + encoded.remove_prefix(user_key_len); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, &sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVO(key, value, value_type) + .GetStatus(); +} + +void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, + const Slice& key, const Slice& value, + ValueType type, SequenceNumber s, + char* checksum_ptr) { + if (moptions_.protection_bytes_per_key == 0) { + return; + } + + uint64_t checksum = 0; + if (kv_prot_info == nullptr) { + checksum = + ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal(); + } else { + checksum = kv_prot_info->GetVal(); + } + switch (moptions_.protection_bytes_per_key) { + case 1: + checksum_ptr[0] = static_cast(checksum); + 
break; + case 2: + EncodeFixed16(checksum_ptr, static_cast(checksum)); + break; + case 4: + EncodeFixed32(checksum_ptr, static_cast(checksum)); + break; + case 8: + EncodeFixed64(checksum_ptr, checksum); + break; + default: + assert(false); + } +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + // checksum : char[moptions_.protection_bytes_per_key] + uint32_t key_size = static_cast(key.size()); + uint32_t val_size = static_cast(value.size()); + uint32_t internal_key_size = key_size + 8; + const uint32_t encoded_len = VarintLength(internal_key_size) + + internal_key_size + VarintLength(val_size) + + val_size + moptions_.protection_bytes_per_key; + char* buf = nullptr; + std::unique_ptr& table = + type == kTypeRangeDeletion ? range_del_table_ : table_; + KeyHandle handle = table->Allocate(encoded_len, &buf); + + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + Slice key_slice(p, key_size); + p += key_size; + uint64_t packed = PackSequenceAndType(s, type); + EncodeFixed64(p, packed); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) == + (unsigned)encoded_len); + + UpdateEntryChecksum(kv_prot_info, key, value, type, s, + buf + encoded_len - moptions_.protection_bytes_per_key); + Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key); + if (kv_prot_info != nullptr) { + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); + + if (!allow_concurrent) { + // Extract prefix for insert with hint. 
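+    // The hint caches the most recent insert position for this prefix, so
+    // runs of inserts that share a prefix can skip most of the search a
+    // fresh lookup into the memtable rep would otherwise do.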
+    if (insert_with_hint_prefix_extractor_ != nullptr &&
+        insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+      Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+      bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+      if (UNLIKELY(!res)) {
+        return Status::TryAgain("key+seq exists");
+      }
+    } else {
+      bool res = table->InsertKey(handle);
+      if (UNLIKELY(!res)) {
+        return Status::TryAgain("key+seq exists");
+      }
+    }
+
+    // this is a bit ugly, but is the way to avoid locked instructions
+    // when incrementing an atomic
+    num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+                       std::memory_order_relaxed);
+    data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+                     std::memory_order_relaxed);
+    if (type == kTypeDeletion || type == kTypeSingleDeletion ||
+        type == kTypeDeletionWithTimestamp) {
+      num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+                         std::memory_order_relaxed);
+    }
+
+    if (bloom_filter_ && prefix_extractor_ &&
+        prefix_extractor_->InDomain(key_without_ts)) {
+      bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
+    }
+    if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+      bloom_filter_->Add(key_without_ts);
+    }
+
+    // The first sequence number inserted into the memtable
+    assert(first_seqno_ == 0 || s >= first_seqno_);
+    if (first_seqno_ == 0) {
+      first_seqno_.store(s, std::memory_order_relaxed);
+
+      if (earliest_seqno_ == kMaxSequenceNumber) {
+        earliest_seqno_.store(GetFirstSequenceNumber(),
+                              std::memory_order_relaxed);
+      }
+      assert(first_seqno_.load() >= earliest_seqno_.load());
+    }
+    assert(post_process_info == nullptr);
+    UpdateFlushState();
+  } else {
+    bool res = (hint == nullptr)
+                   ? table->InsertKeyConcurrently(handle)
+                   : table->InsertKeyWithHintConcurrently(handle, hint);
+    if (UNLIKELY(!res)) {
+      return Status::TryAgain("key+seq exists");
+    }
+
+    assert(post_process_info != nullptr);
+    post_process_info->num_entries++;
+    post_process_info->data_size += encoded_len;
+    if (type == kTypeDeletion) {
+      post_process_info->num_deletes++;
+    }
+
+    if (bloom_filter_ && prefix_extractor_ &&
+        prefix_extractor_->InDomain(key_without_ts)) {
+      bloom_filter_->AddConcurrently(
+          prefix_extractor_->Transform(key_without_ts));
+    }
+    if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+      bloom_filter_->AddConcurrently(key_without_ts);
+    }
+
+    // atomically update first_seqno_ and earliest_seqno_.
+    uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+    while ((cur_seq_num == 0 || s < cur_seq_num) &&
+           !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+    }
+    uint64_t cur_earliest_seqno =
+        earliest_seqno_.load(std::memory_order_relaxed);
+    while (
+        (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+        !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+    }
+  }
+  if (type == kTypeRangeDeletion) {
+    auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+    size_t size = cached_range_tombstone_.Size();
+    if (allow_concurrent) {
+      range_del_mutex_.lock();
+    }
+    for (size_t i = 0; i < size; ++i) {
+      std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+          cached_range_tombstone_.AccessAtCore(i);
+      auto new_local_cache_ref = std::make_shared<
+          const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+      // It is okay for some reader to load old cache during invalidation as
+      // the new sequence number is not published yet.
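+      // A reader that loaded the old cache is still correct: it simply does
+      // not see the new tombstone, whose sequence number is not yet visible
+      // to it anyway.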
+      // Each core will have a shared_ptr to a shared_ptr to the cached
+      // fragmented range tombstones, so that the ref count is maintained
+      // locally per-core using the per-core shared_ptr.
+      std::atomic_store_explicit(
+          local_cache_ref_ptr,
+          std::shared_ptr<FragmentedRangeTombstoneListCache>(
+              new_local_cache_ref, new_cache.get()),
+          std::memory_order_relaxed);
+    }
+    if (allow_concurrent) {
+      range_del_mutex_.unlock();
+    }
+    is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+  }
+  UpdateOldestKeyTime();
+
+  TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded);
+  return Status::OK();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+  Status* status;
+  const LookupKey* key;
+  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
+  bool* merge_in_progress;
+  std::string* value;
+  PinnableWideColumns* columns;
+  SequenceNumber seq;
+  std::string* timestamp;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered
+  MergeContext* merge_context;
+  SequenceNumber max_covering_tombstone_seq;
+  MemTable* mem;
+  Logger* logger;
+  Statistics* statistics;
+  bool inplace_update_support;
+  bool do_merge;
+  SystemClock* clock;
+
+  ReadCallback* callback_;
+  bool* is_blob_index;
+  bool allow_data_in_errors;
+  size_t protection_bytes_per_key;
+  bool CheckCallback(SequenceNumber _seq) {
+    if (callback_) {
+      return callback_->IsVisible(_seq);
+    }
+    return true;
+  }
+};
+}  // anonymous namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+  TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry);
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  assert(s != nullptr);
+  assert(!s->value || !s->columns);
+
+  if (s->protection_bytes_per_key > 0) {
+    *(s->status) = MemTable::VerifyEntryChecksum(
+        entry, s->protection_bytes_per_key, s->allow_data_in_errors);
+    if (!s->status->ok()) {
+      ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
+      // Memtable entry corrupted
+      return false;
+    }
+  }
+
+  MergeContext* merge_context = s->merge_context;
+  SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+  const MergeOperator* merge_operator = s->merge_operator;
+
+  assert(merge_context != nullptr);
+
+  // Refer to comments under MemTable::Add() for entry format.
+  // Check that it belongs to same user key.
+  uint32_t key_length = 0;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  assert(key_length >= 8);
+  Slice user_key_slice = Slice(key_ptr, key_length - 8);
+  const Comparator* user_comparator =
+      s->mem->GetInternalKeyComparator().user_comparator();
+  size_t ts_sz = user_comparator->timestamp_size();
+  if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) {
+    // timestamp should already be set to range tombstone timestamp
+    assert(s->timestamp->size() == ts_sz);
+  }
+  if (user_comparator->EqualWithoutTimestamp(user_key_slice,
+                                             s->key->user_key())) {
+    // Correct user key
+    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+    ValueType type;
+    SequenceNumber seq;
+    UnPackSequenceAndType(tag, &seq, &type);
+    // If the value is not in the snapshot, skip it
+    if (!s->CheckCallback(seq)) {
+      return true;  // to continue to the next seq
+    }
+
+    if (s->seq == kMaxSequenceNumber) {
+      s->seq = seq;
+      if (s->seq > max_covering_tombstone_seq) {
+        if (ts_sz && s->timestamp != nullptr) {
+          // `timestamp` was set to range tombstone's timestamp before
+          // `SaveValue` is ever called.
This key has a higher sequence number + // than range tombstone, and is the key with the highest seqno across + // all keys with this user_key, so we update timestamp here. + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts_sz); + } + } else { + s->seq = max_covering_tombstone_seq; + } + } + + if (ts_sz > 0 && s->timestamp != nullptr) { + if (!s->timestamp->empty()) { + assert(ts_sz == s->timestamp->size()); + } + // TODO optimize for smaller size ts + const std::string kMaxTs(ts_sz, '\xff'); + if (s->timestamp->empty() || + user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) { + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts_sz); + } + } + + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || + type == kTypeWideColumnEntity || type == kTypeDeletion || + type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) && + max_covering_tombstone_seq > seq) { + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeBlobIndex: { + if (!s->do_merge) { + *(s->status) = Status::NotSupported( + "GetMergeOperands not supported by stacked BlobDB"); + *(s->found_final_value) = true; + return false; + } + + if (*(s->merge_in_progress)) { + *(s->status) = Status::NotSupported( + "Merge operator not supported by stacked BlobDB"); + *(s->found_final_value) = true; + return false; + } + + if (s->is_blob_index == nullptr) { + ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index."); + *(s->status) = Status::NotSupported( + "Encountered unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB."); + *(s->found_final_value) = true; + return false; + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (s->value) { + s->value->assign(v.data(), v.size()); + } else if (s->columns) { + s->columns->SetPlainValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + *(s->is_blob_index) = true; + + return false; + } + case kTypeValue: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + // TODO(yanqin) update MergeContext so that timestamps information + // can also be retained. 
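+        // operand_pinned is true when inplace updates are disabled: the
+        // entry's memory is then stable for the lifetime of the read, so
+        // MergeContext may reference it directly instead of copying.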
+ + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + } else if (*(s->merge_in_progress)) { + assert(s->do_merge); + + if (s->value || s->columns) { + std::string result; + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(result); + } + } + } + } else if (s->value) { + s->value->assign(v.data(), v.size()); + } else if (s->columns) { + s->columns->SetPlainValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = false; + } + + return false; + } + case kTypeWideColumnEntity: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + + if (s->status->ok()) { + merge_context->PushOperand( + value_of_default, + s->inplace_update_support == false /* operand_pinned */); + } + } else if (*(s->merge_in_progress)) { + assert(s->do_merge); + + if (s->value) { + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + if (s->status->ok()) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &value_of_default, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + } + } else if (s->columns) { + std::string result; + *(s->status) = MergeHelper::TimedFullMergeWithEntity( + merge_operator, s->key->user_key(), v, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true); + + if (s->status->ok()) { + *(s->status) = s->columns->SetWideColumnValue(result); + } + } + } else if (s->value) { + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + if (s->status->ok()) { + s->value->assign(value_of_default.data(), value_of_default.size()); + } + } else if (s->columns) { + *(s->status) = s->columns->SetWideColumnValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = false; + } + + return false; + } + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + case kTypeRangeDeletion: { + if (*(s->merge_in_progress)) { + if (s->value || s->columns) { + std::string result; + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(result); + } + } + } + } else { 
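+          // No merge in progress: any deletion type simply means the key is
+          // not found as of this read's sequence number.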
+ *(s->status) = Status::NotFound(); + } + *(s->found_final_value) = true; + return false; + } + case kTypeMerge: { + if (!merge_operator) { + *(s->status) = Status::InvalidArgument( + "merge_operator is not properly initialized."); + // Normally we continue the loop (return true) when we see a merge + // operand. But in case of an error, we should stop the loop + // immediately and pretend we have found the value to stop further + // seek. Otherwise, the later call will override this error status. + *(s->found_final_value) = true; + return false; + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->merge_in_progress) = true; + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { + if (s->value || s->columns) { + std::string result; + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(result); + } + } + } + + *(s->found_final_value) = true; + return false; + } + return true; + } + default: { + std::string msg("Corrupted value not expected."); + if (s->allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) + + ". "); + msg.append("seq: " + std::to_string(seq) + "."); + } + *(s->status) = Status::Corruption(msg.c_str()); + return false; + } + } + } + + // s->state could be Corrupt, merge or notfound + return false; +} + +bool MemTable::Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + bool immutable_memtable, ReadCallback* callback, + bool* is_blob_index, bool do_merge) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. + return false; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + std::unique_ptr range_del_iter( + NewRangeTombstoneIterator(read_opts, + GetInternalKeySeqno(key.internal_key()), + immutable_memtable)); + if (range_del_iter != nullptr) { + SequenceNumber covering_seq = + range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()); + if (covering_seq > *max_covering_tombstone_seq) { + *max_covering_tombstone_seq = covering_seq; + if (timestamp) { + // Will be overwritten in SaveValue() if there is a point key with + // a higher seqno. 
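+      // max_covering_tombstone_seq is the largest sequence number among range
+      // tombstones covering this user key; SaveValue() treats point entries
+      // with lower sequence numbers as deleted.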
+ timestamp->assign(range_del_iter->timestamp().data(), + range_del_iter->timestamp().size()); + } + } + } + + bool found_final_value = false; + bool merge_in_progress = s->IsMergeInProgress(); + bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); + bool bloom_checked = false; + if (bloom_filter_) { + // when both memtable_whole_key_filtering and prefix_extractor_ are set, + // only do whole key filtering for Get() to save CPU + if (moptions_.memtable_whole_key_filtering) { + may_contain = bloom_filter_->MayContain(user_key_without_ts); + bloom_checked = true; + } else { + assert(prefix_extractor_); + if (prefix_extractor_->InDomain(user_key_without_ts)) { + may_contain = bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); + bloom_checked = true; + } + } + } + + if (bloom_filter_ && !may_contain) { + // iter is null if prefix bloom says the key does not exist + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + *seq = kMaxSequenceNumber; + } else { + if (bloom_checked) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, + is_blob_index, value, columns, timestamp, s, merge_context, + seq, &found_final_value, &merge_in_progress); + } + + // No change to value, since we have not yet found a Put/Delete + // Propagate corruption error + if (!found_final_value && merge_in_progress && !s->IsCorruption()) { + *s = Status::MergeInProgress(); + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); + return found_final_value; +} + +void MemTable::GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, + bool do_merge, ReadCallback* callback, + bool* is_blob_index, std::string* value, + PinnableWideColumns* columns, + std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress) { + Saver saver; + saver.status = s; + saver.found_final_value = found_final_value; + saver.merge_in_progress = merge_in_progress; + saver.key = &key; + saver.value = value; + saver.columns = columns; + saver.timestamp = timestamp; + saver.seq = kMaxSequenceNumber; + saver.mem = this; + saver.merge_context = merge_context; + saver.max_covering_tombstone_seq = max_covering_tombstone_seq; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; + saver.clock = clock_; + saver.callback_ = callback; + saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; + saver.allow_data_in_errors = moptions_.allow_data_in_errors; + saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; + table_->Get(key, &saver, SaveValue); + *seq = saver.seq; +} + +void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool immutable_memtable) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. + return; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + // For now, memtable Bloom filter is effectively disabled if there are any + // range tombstones. This is the simplest way to ensure range tombstones are + // handled. 
TODO: allow Bloom checks where max_covering_tombstone_seq==0 + bool no_range_del = read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed); + MultiGetRange temp_range(*range, range->begin(), range->end()); + if (bloom_filter_ && no_range_del) { + bool whole_key = + !prefix_extractor_ || moptions_.memtable_whole_key_filtering; + std::array bloom_keys; + std::array may_match; + std::array range_indexes; + int num_keys = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (whole_key) { + bloom_keys[num_keys] = iter->ukey_without_ts; + range_indexes[num_keys++] = iter.index(); + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + bloom_keys[num_keys] = + prefix_extractor_->Transform(iter->ukey_without_ts); + range_indexes[num_keys++] = iter.index(); + } + } + bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + for (int i = 0; i < num_keys; ++i) { + if (!may_match[i]) { + temp_range.SkipIndex(range_indexes[i]); + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + bool found_final_value{false}; + bool merge_in_progress = iter->s->IsMergeInProgress(); + if (!no_range_del) { + std::unique_ptr range_del_iter( + NewRangeTombstoneIteratorInternal( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()), + immutable_memtable)); + SequenceNumber covering_seq = + range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()); + if (covering_seq > iter->max_covering_tombstone_seq) { + iter->max_covering_tombstone_seq = covering_seq; + if (iter->timestamp) { + // Will be overwritten in SaveValue() if there is a point key with + // a higher seqno. + iter->timestamp->assign(range_del_iter->timestamp().data(), + range_del_iter->timestamp().size()); + } + } + } + SequenceNumber dummy_seq; + GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, + callback, &iter->is_blob_index, iter->value->GetSelf(), + /*columns=*/nullptr, iter->timestamp, iter->s, + &(iter->merge_context), &dummy_seq, &found_final_value, + &merge_in_progress); + + if (!found_final_value && merge_in_progress) { + *(iter->s) = Status::MergeInProgress(); + } + + if (found_final_value) { + iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); + range->MarkKeyDone(iter); + RecordTick(moptions_.statistics, MEMTABLE_HIT); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + // Set all remaining keys in range to Abort + for (auto range_iter = range->begin(); range_iter != range->end(); + ++range_iter) { + range->MarkKeyDone(range_iter); + *(range_iter->s) = Status::Aborted(); + } + break; + } + } + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); +} + +Status MemTable::Update(SequenceNumber seq, ValueType value_type, + const Slice& key, const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info) { + LookupKey lkey(key, seq); + Slice mem_key = lkey.memtable_key(); + + std::unique_ptr iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(lkey.internal_key(), mem_key.data()); + + if (iter->Valid()) { + // Refer to comments under MemTable::Add() for entry format. + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
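+    // The seek target lkey.memtable_key() carries the varint32 length prefix,
+    // matching the entry layout documented in MemTable::Add().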
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      ValueType type;
+      SequenceNumber existing_seq;
+      UnPackSequenceAndType(tag, &existing_seq, &type);
+      assert(existing_seq != seq);
+      if (type == value_type) {
+        Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+        uint32_t new_size = static_cast<uint32_t>(value.size());
+
+        // Update value, if new value size <= previous value size
+        if (new_size <= prev_size) {
+          char* p =
+              EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+          WriteLock wl(GetLock(lkey.user_key()));
+          memcpy(p, value.data(), value.size());
+          assert((unsigned)((p + value.size()) - entry) ==
+                 (unsigned)(VarintLength(key_length) + key_length +
+                            VarintLength(value.size()) + value.size()));
+          RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            // `seq` is swallowed and `existing_seq` prevails.
+            updated_kv_prot_info.UpdateS(seq, existing_seq);
+            UpdateEntryChecksum(&updated_kv_prot_info, key, value, type,
+                                existing_seq, p + value.size());
+            Slice encoded(entry, p + value.size() - entry);
+            return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+          } else {
+            UpdateEntryChecksum(nullptr, key, value, type, existing_seq,
+                                p + value.size());
+          }
+          return Status::OK();
+        }
+      }
+    }
+  }
+
+  // The latest value is not value_type or key doesn't exist
+  return Add(seq, value_type, key, value, kv_prot_info);
+}
+
+Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
+                                const Slice& delta,
+                                const ProtectionInfoKVOS64* kv_prot_info) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetDynamicPrefixIterator());
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // Refer to comments under MemTable::Add() for entry format.
+    // Check that it belongs to same user key. We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      ValueType type;
+      uint64_t existing_seq;
+      UnPackSequenceAndType(tag, &existing_seq, &type);
+      if (type == kTypeValue) {
+        Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+        char* prev_buffer = const_cast<char*>(prev_value.data());
+        uint32_t new_prev_size = prev_size;
+
+        std::string str_value;
+        WriteLock wl(GetLock(lkey.user_key()));
+        auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+                                                 delta, &str_value);
+        if (status == UpdateStatus::UPDATED_INPLACE) {
+          // Value already updated by callback.
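+          // Editorial note (not part of the upstream change): when the
+          // callback shrank the value in place, the length prefix must be
+          // rewritten below; if the varint encoding of the new size is
+          // shorter, the value bytes are shifted left so the entry stays
+          // contiguous.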
+          assert(new_prev_size <= prev_size);
+          if (new_prev_size < prev_size) {
+            // overwrite the new prev_size
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     new_prev_size);
+            if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+              // shift the value buffer as well.
+              memcpy(p, prev_buffer, new_prev_size);
+              prev_buffer = p;
+            }
+          }
+          RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+          UpdateFlushState();
+          Slice new_value(prev_buffer, new_prev_size);
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            // `seq` is swallowed and `existing_seq` prevails.
+            updated_kv_prot_info.UpdateS(seq, existing_seq);
+            updated_kv_prot_info.UpdateV(delta, new_value);
+            Slice encoded(entry, prev_buffer + new_prev_size - entry);
+            UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type,
+                                existing_seq, prev_buffer + new_prev_size);
+            return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+          } else {
+            UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq,
+                                prev_buffer + new_prev_size);
+          }
+          return Status::OK();
+        } else if (status == UpdateStatus::UPDATED) {
+          Status s;
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            updated_kv_prot_info.UpdateV(delta, str_value);
+            s = Add(seq, kTypeValue, key, Slice(str_value),
+                    &updated_kv_prot_info);
+          } else {
+            s = Add(seq, kTypeValue, key, Slice(str_value),
+                    nullptr /* kv_prot_info */);
+          }
+          RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+          UpdateFlushState();
+          return s;
+        } else if (status == UpdateStatus::UPDATE_FAILED) {
+          // `UPDATE_FAILED` is named incorrectly. It indicates no update
+          // happened. It does not indicate a failure happened.
+          UpdateFlushState();
+          return Status::OK();
+        }
+      }
+    }
+  }
+  // The latest value is not `kTypeValue` or key doesn't exist
+  return Status::NotFound();
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
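+  // Editorial example (not part of the upstream change): for a memtable
+  // holding, newest first,
+  //   (key, seq=12, kTypeMerge), (key, seq=11, kTypeMerge),
+  //   (key, seq=10, kTypeValue)
+  // this function returns 2.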
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetDynamicPrefixIterator());
+  iter->Seek(key.internal_key(), memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (!comparator_.comparator.user_comparator()->Equal(
+            Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+      break;
+    }
+
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    ValueType type;
+    uint64_t unused;
+    UnPackSequenceAndType(tag, &unused, &type);
+    if (type != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+                      bool (*callback_func)(void* arg, const char* entry)) {
+  auto iter = GetDynamicPrefixIterator();
+  for (iter->Seek(k.internal_key(), k.memtable_key().data());
+       iter->Valid() && callback_func(callback_args, iter->key());
+       iter->Next()) {
+  }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+  assert(log > 0);
+  auto cur = min_prep_log_referenced_.load();
+  while ((log < cur || cur == 0) &&
+         !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+    cur = min_prep_log_referenced_.load();
+  }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+  return min_prep_log_referenced_.load();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..6db2721e4
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,664 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+class SystemClock;
+
+struct ImmutableMemTableOptions {
+  explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
+                                    const MutableCFOptions& mutable_cf_options);
+  size_t arena_block_size;
+  uint32_t memtable_prefix_bloom_bits;
+  size_t memtable_huge_page_size;
+  bool memtable_whole_key_filtering;
+  bool inplace_update_support;
+  size_t inplace_update_num_locks;
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+  size_t max_successive_merges;
+  Statistics* statistics;
+  MergeOperator* merge_operator;
+  Logger* info_log;
+  bool allow_data_in_errors;
+  uint32_t protection_bytes_per_key;
+};
+
+// Batched counters to be updated when inserting keys in one write batch.
+// In post process of the write batch, these can be updated together.
+// Only used in concurrent memtable insert case.
+struct MemTablePostProcessInfo {
+  uint64_t data_size = 0;
+  uint64_t num_entries = 0;
+  uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note: Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods. This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable. Calling MarkImmutable() is
+// not sufficient to guarantee immutability. It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const override;
+    virtual int operator()(const char* prefix_len_key,
+                           const DecodedType& key) const override;
+  };
+
+  // MemTables are reference counted. The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  //
+  // earliest_seq should be the current SequenceNumber in the db such that any
+  // key inserted into this memtable will have an equal or larger seq number.
+  // (When a db is first created, the earliest sequence number will be 0).
+  // If the earliest sequence number is not known, kMaxSequenceNumber may be
+  // used, but this may prevent some transactions from succeeding until the
+  // first key is inserted into the memtable.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const ImmutableOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBufferManager* write_buffer_manager,
+                    SequenceNumber earliest_seq, uint32_t column_family_id);
+  // No copying allowed
+  MemTable(const MemTable&) = delete;
+  MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it is not in use.
+  ~MemTable();
+
+  // Increase reference count.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate though
+  size_t ApproximateMemoryUsageFast() const {
+    return approximate_memory_usage_.load(std::memory_order_relaxed);
+  }
+
+  // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
+  size_t MemoryAllocatedBytes() const {
+    return table_->ApproximateMemoryUsage() +
+           range_del_table_->ApproximateMemoryUsage() +
+           arena_.MemoryAllocatedBytes();
+  }
+
+  // Returns a vector of unique random memtable entries of size 'sample_size'.
+  //
+  // Note: the entries are stored in the unordered_set as length-prefixed keys,
+  // hence their representation in the set as "const char*".
+  // Note2: the size of the output set 'entries' is not enforced to be strictly
+  //        equal to 'target_sample_size'. Its final size might be slightly
+  //        greater or slightly less than 'target_sample_size'
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  // REQUIRES: SkipList memtable representation. This function is not
+  // implemented for any other type of memtable representation (vectorrep,
+  // hashskiplist,...).
+  void UniqueRandomSample(const uint64_t& target_sample_size,
+                          std::unordered_set<const char*>* entries) {
+    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+    // Extend it to all other memtable representations.
+    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
+  }
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldScheduleFlush() const {
+    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+  }
+
+  // Returns true if a flush should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkFlushScheduled() {
+    auto before = FLUSH_REQUESTED;
+    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+                                                std::memory_order_relaxed,
+                                                std::memory_order_relaxed);
+  }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live. The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+  // Returns an iterator that yields the range tombstones of the memtable.
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.
+  // @param immutable_memtable Whether this memtable is an immutable memtable.
+  // This information is not stored in memtable itself, so it needs to be
+  // specified by the caller. This flag is used internally to decide whether a
+  // cached fragmented range tombstone list can be returned. This cached
+  // version is constructed when a memtable becomes immutable. Setting the
+  // flag to false will always yield correct result, but may incur performance
+  // penalty as it always creates a new fragmented range tombstone list.
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
+
+  Status VerifyEncodedEntry(Slice encoded,
+                            const ProtectionInfoKVOS64& kv_prot_info);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  //
+  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+  // simultaneous operations on the same MemTable.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
+  // true. The next attempt should try a larger value for `seq`.
+  Status Add(SequenceNumber seq, ValueType type, const Slice& key,
+             const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
+             bool allow_concurrent = false,
+             MemTablePostProcessInfo* post_process_info = nullptr,
+             void** hint = nullptr);
+
+  // Used to Get value associated with key or Get Merge Operands associated
+  // with key.
+  // If do_merge = true the default behavior which is Get value for key is
+  // executed. Expected behavior is described right below.
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands,
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  // If any operation was found, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
+  // status returned indicates a corruption or other unexpected error.
+  // If do_merge = false then any Merge Operands encountered for key are
+  // simply stored in merge_context.operands_list and never actually merged
+  // to get a final value. The raw Merge Operands are eventually returned to
+  // the user.
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true);
+
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true) {
+    SequenceNumber seq;
+    return Get(key, value, columns, timestamp, s, merge_context,
+               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
+               callback, is_blob_index, do_merge);
+  }
+
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
+  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                ReadCallback* callback, bool immutable_memtable);
+
+  // If `key` exists in current memtable with type value_type and the existing
+  // value is at least as large as the new value, updates it in-place.
+  // Otherwise adds the new value to the memtable out-of-place.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
+  // true. The next attempt should try a larger value for `seq`.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
+                const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);
+
+  // If `key` exists in current memtable with type `kTypeValue` and the
+  // existing value is at least as large as the new value, updates it
+  // in-place. Otherwise if `key` exists in current memtable with type
+  // `kTypeValue`, adds the new value to the memtable out-of-place.
+  //
+  // Returns `Status::NotFound` if `key` does not exist in current memtable or
+  // the latest version of `key` does not have `kTypeValue`.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
+  // true. The next attempt should try a larger value for `seq`.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  Status UpdateCallback(SequenceNumber seq, const Slice& key,
+                        const Slice& delta,
+                        const ProtectionInfoKVOS64* kv_prot_info);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Update counters and flush status after inserting a whole write batch
+  // Used in concurrent memtable inserts.
+  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+    num_entries_.fetch_add(update_counters.num_entries,
+                           std::memory_order_relaxed);
+    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+    if (update_counters.num_deletes != 0) {
+      num_deletes_.fetch_add(update_counters.num_deletes,
+                             std::memory_order_relaxed);
+    }
+    UpdateFlushState();
+  }
+
+  // Get total number of entries in the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_entries() const {
+    return num_entries_.load(std::memory_order_relaxed);
+  }
+
+  // Get total number of deletes in the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_deletes() const {
+    return num_deletes_.load(std::memory_order_relaxed);
+  }
+
+  uint64_t get_data_size() const {
+    return data_size_.load(std::memory_order_relaxed);
+  }
+
+  // Dynamically change the memtable's capacity. If set below the current
+  // usage, the next key added will trigger a flush. Can only increase size
+  // when memtable prefix bloom is disabled, since we can't easily allocate
+  // more space.
+  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+    if (bloom_filter_ == nullptr ||
+        new_write_buffer_size < write_buffer_size_) {
+      write_buffer_size_.store(new_write_buffer_size,
+                               std::memory_order_relaxed);
+    }
+  }
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns if there is no entry inserted to the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  bool IsEmpty() const { return first_seqno_ == 0; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  SequenceNumber GetFirstSequenceNumber() {
+    return first_seqno_.load(std::memory_order_relaxed);
+  }
+
+  // Sets the sequence number of the first element that was inserted
+  // into the memtable.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  void SetFirstSequenceNumber(SequenceNumber first_seqno) {
+    return first_seqno_.store(first_seqno, std::memory_order_relaxed);
+  }
+
+  // Returns the sequence number that is guaranteed to be smaller than or
+  // equal to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  SequenceNumber GetEarliestSequenceNumber() {
+    return earliest_seqno_.load(std::memory_order_relaxed);
+  }
+
+  // Sets the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable.
+  // It can then be assumed that any write with a larger (or equal) sequence
+  // number will be present in this memtable or a later memtable.
+  // Used only for MemPurge operation
+  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
+    return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
+  }
+
+  // DB's latest sequence ID when the memtable is created. This number
+  // may be updated to a more recent one before any key is inserted.
+  SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // If this memtable contains data from a committed two phase transaction,
+  // we must take note of the log which contains that data so we can know
+  // when to release that log.
+  void RefLogContainingPrepSection(uint64_t log);
+  uint64_t GetMinLogContainingPrepSection();
+
+  // Notify the underlying storage that no more items will be added.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable(). (i.e. do not call Add() or Update()).
+  void MarkImmutable() {
+    table_->MarkReadOnly();
+    mem_tracker_.DoneAllocating();
+  }
+
+  // Notify the underlying storage that all data it contained has been
+  // persisted.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void MarkFlushed() { table_->MarkFlushed(); }
+
+  // return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // return true if the current MemTableRep supports snapshots.
+  // inplace update prevents snapshots.
+  bool IsSnapshotSupported() const {
+    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+  }
+
+  struct MemTableStats {
+    uint64_t size;
+    uint64_t count;
+  };
+
+  MemTableStats ApproximateStats(const Slice& start_ikey,
+                                 const Slice& end_ikey);
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+    return &moptions_;
+  }
+
+  uint64_t ApproximateOldestKeyTime() const {
+    return oldest_key_time_.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: db_mutex held.
+  void SetID(uint64_t id) { id_ = id; }
+
+  uint64_t GetID() const { return id_; }
+
+  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+  uint64_t GetFileNumber() const { return file_number_; }
+
+  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+  void SetFlushInProgress(bool in_progress) {
+    flush_in_progress_ = in_progress;
+  }
+
+#ifndef ROCKSDB_LITE
+  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+    flush_job_info_ = std::move(info);
+  }
+
+  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+    return std::move(flush_job_info_);
+  }
+#endif  // !ROCKSDB_LITE
+
+  // Returns a heuristic flush decision
+  bool ShouldFlushNow();
+
+  void ConstructFragmentedRangeTombstones();
+
+  // Returns whether a fragmented range tombstone list is already constructed
+  // for this memtable. It should be constructed right before a memtable is
+  // added to an immutable memtable list. Note that if a memtable does not
+  // have any range tombstone, then no range tombstone list will ever be
+  // constructed.
+  // @param allow_empty Specifies whether a memtable with no range tombstone
+  //  is considered to have its fragmented range tombstone list constructed.
+  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
+    if (allow_empty) {
+      return fragmented_range_tombstone_list_.get() != nullptr ||
+             is_range_del_table_empty_;
+    } else {
+      return fragmented_range_tombstone_list_.get() != nullptr;
+    }
+  }
+
+  // Returns Corruption status if verification fails.
+  static Status VerifyEntryChecksum(const char* entry,
+                                    size_t protection_bytes_per_key,
+                                    bool allow_data_in_errors = false);
+
+ private:
+  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  const ImmutableMemTableOptions moptions_;
+  int refs_;
+  const size_t kArenaBlockSize;
+  AllocTracker mem_tracker_;
+  ConcurrentArena arena_;
+  std::unique_ptr<MemTableRep> table_;
+  std::unique_ptr<MemTableRep> range_del_table_;
+  std::atomic_bool is_range_del_table_empty_;
+
+  // Total data size of all data inserted
+  std::atomic<uint64_t> data_size_;
+  std::atomic<uint64_t> num_entries_;
+  std::atomic<uint64_t> num_deletes_;
+
+  // Dynamically changeable memtable option
+  std::atomic<size_t> write_buffer_size_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_;  // started the flush
+  bool flush_completed_;    // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  std::atomic<SequenceNumber> first_seqno_;
+
+  // The db sequence number at the time of creation or kMaxSequenceNumber
+  // if not set.
+  std::atomic<SequenceNumber> earliest_seqno_;
+
+  SequenceNumber creation_seq_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // the earliest log containing a prepared section
+  // which has been inserted into this memtable.
+  std::atomic<uint64_t> min_prep_log_referenced_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<DynamicBloom> bloom_filter_;
+
+  std::atomic<FlushStateEnum> flush_state_;
+
+  SystemClock* clock_;
+
+  // Extract sequential insert prefixes.
+  const SliceTransform* insert_with_hint_prefix_extractor_;
+
+  // Insert hints for each prefix.
+  UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;
+
+  // Timestamp of oldest key
+  std::atomic<uint64_t> oldest_key_time_;
+
+  // Memtable id to track flush.
+  uint64_t id_ = 0;
+
+  // Sequence number of the atomic flush that is responsible for this memtable.
+  // The sequence number of atomic flush is a seq, such that no writes with
+  // sequence numbers greater than or equal to seq are flushed, while all
+  // writes with sequence number smaller than seq are flushed.
+  SequenceNumber atomic_flush_seqno_;
+
+  // keep track of memory usage in table_, arena_, and range_del_table_.
+  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+  std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+  // Flush job info of the current memtable.
+  std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif  // !ROCKSDB_LITE
+
+  // Updates flush_state_ using ShouldFlushNow()
+  void UpdateFlushState();
+
+  void UpdateOldestKeyTime();
+
+  void GetFromTable(const LookupKey& key,
+                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
+                    ReadCallback* callback, bool* is_blob_index,
+                    std::string* value, PinnableWideColumns* columns,
+                    std::string* timestamp, Status* s,
+                    MergeContext* merge_context, SequenceNumber* seq,
+                    bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks (e.g.,
+  // is_range_del_table_empty_) are done. This is only valid during the
+  // lifetime of the underlying memtable.
+  // read_seq and read_options.timestamp will be used as the upper bound
+  // for range tombstones.
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
+
+  // The fragmented range tombstones of this memtable.
+  // This is constructed when this memtable becomes immutable
+  // if !is_range_del_table_empty_.
+  std::unique_ptr<FragmentedRangeTombstoneList>
+      fragmented_range_tombstone_list_;
+
+  // makes sure there is a single range tombstone writer to invalidate cache
+  std::mutex range_del_mutex_;
+  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
+      cached_range_tombstone_;
+
+  void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+                           const Slice& key, const Slice& value, ValueType type,
+                           SequenceNumber s, char* checksum_ptr);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..1545003ad
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,991 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+  memlist_.push_front(m);
+  *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+                                        MemTable* m) {
+  if (m->Unref()) {
+    to_delete->push_back(m);
+    assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+    *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+  }
+}
+
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
+    : max_write_buffer_number_to_maintain_(
+          old.max_write_buffer_number_to_maintain_),
+      max_write_buffer_size_to_maintain_(
+          old.max_write_buffer_size_to_maintain_),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+  memlist_ = old.memlist_;
+  for (auto& m : memlist_) {
+    m->Ref();
+  }
+
+  memlist_history_ = old.memlist_history_;
+  for (auto& m : memlist_history_) {
+    m->Ref();
+  }
+}
+
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage,
+    int max_write_buffer_number_to_maintain,
+    int64_t max_write_buffer_size_to_maintain)
+    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+      max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    // if to_delete is equal to nullptr it means we're confident
+    // that refs_ will not be zero
+    assert(to_delete != nullptr);
+    for (const auto& m : memlist_) {
+      UnrefMemTable(to_delete, m);
+    }
+    for (const auto& m : memlist_history_) {
+      UnrefMemTable(to_delete, m);
+    }
+    delete this;
+  }
+}
+
+int MemTableList::NumNotFlushed() const {
+  int size = static_cast<int>(current_->memlist_.size());
+  assert(num_flush_not_started_ <= size);
+  return size;
+}
+
+int MemTableList::NumFlushed() const {
+  return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
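+// Editorial example (not part of the upstream change): with
+// memlist_ = [m3 (newest), m2, m1], a lookup probes m3 first and stops at the
+// first memtable that reports a terminal result; merge operands keep
+// accumulating across memtables until then.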
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+                              PinnableWideColumns* columns,
+                              std::string* timestamp, Status* s,
+                              MergeContext* merge_context,
+                              SequenceNumber* max_covering_tombstone_seq,
+                              SequenceNumber* seq, const ReadOptions& read_opts,
+                              ReadCallback* callback, bool* is_blob_index) {
+  return GetFromList(&memlist_, key, value, columns, timestamp, s,
+                     merge_context, max_covering_tombstone_seq, seq, read_opts,
+                     callback, is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+                                   MultiGetRange* range,
+                                   ReadCallback* callback) {
+  for (auto memtable : memlist_) {
+    memtable->MultiGet(read_options, range, callback,
+                       true /* immutable_memtable */);
+    if (range->empty()) {
+      return;
+    }
+  }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+    const LookupKey& key, Status* s, MergeContext* merge_context,
+    SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+  for (MemTable* memtable : memlist_) {
+    bool done = memtable->Get(
+        key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s,
+        merge_context, max_covering_tombstone_seq, read_opts,
+        true /* immutable_memtable */, nullptr, nullptr, false);
+    if (done) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+    const LookupKey& key, std::string* value, PinnableWideColumns* columns,
+    std::string* timestamp, Status* s, MergeContext* merge_context,
+    SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+    const ReadOptions& read_opts, bool* is_blob_index) {
+  return GetFromList(&memlist_history_, key, value, columns, timestamp, s,
+                     merge_context, max_covering_tombstone_seq, seq, read_opts,
+                     nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+    std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+    PinnableWideColumns* columns, std::string* timestamp, Status* s,
+    MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+    SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback,
+    bool* is_blob_index) {
+  *seq = kMaxSequenceNumber;
+
+  for (auto& memtable : *list) {
+    assert(memtable->IsFragmentedRangeTombstonesConstructed());
+    SequenceNumber current_seq = kMaxSequenceNumber;
+
+    bool done =
+        memtable->Get(key, value, columns, timestamp, s, merge_context,
+                      max_covering_tombstone_seq, &current_seq, read_opts,
+                      true /* immutable_memtable */, callback, is_blob_index);
+    if (*seq == kMaxSequenceNumber) {
+      // Store the most recent sequence number of any operation on this key.
+      // Since we only care about the most recent change, we only need to
+      // return the first operation found when searching memtables in
+      // reverse-chronological order.
+      // current_seq would be equal to kMaxSequenceNumber if the value was to
+      // be skipped. This allows seq to be assigned again when the next value
+      // is read.
+      *seq = current_seq;
+    }
+
+    if (done) {
+      assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+      return true;
+    }
+    if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+      return false;
+    }
+  }
+  return false;
+}
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+    const ReadOptions& read_opts, Arena* /*arena*/,
+    RangeDelAggregator* range_del_agg) {
+  assert(range_del_agg != nullptr);
+  // Except for snapshot read, using kMaxSequenceNumber is OK because these
+  // are immutable memtables.
+  SequenceNumber read_seq = read_opts.snapshot != nullptr
+                                ? read_opts.snapshot->GetSequenceNumber()
+                                : kMaxSequenceNumber;
+  for (auto& m : memlist_) {
+    assert(m->IsFragmentedRangeTombstonesConstructed());
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        m->NewRangeTombstoneIterator(read_opts, read_seq,
+                                     true /* immutable_memtable */));
+    range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+  return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+    Arena* arena) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options, arena));
+  }
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+                                       MergeIteratorBuilder* merge_iter_builder,
+                                       bool add_range_tombstone_iter) {
+  for (auto& m : memlist_) {
+    auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
+    if (!add_range_tombstone_iter || options.ignore_range_deletions) {
+      merge_iter_builder->AddIterator(mem_iter);
+    } else {
+      // Except for snapshot read, using kMaxSequenceNumber is OK because these
+      // are immutable memtables.
+      SequenceNumber read_seq = options.snapshot != nullptr
+                                    ? options.snapshot->GetSequenceNumber()
+                                    : kMaxSequenceNumber;
+      TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+      auto range_del_iter = m->NewRangeTombstoneIterator(
+          options, read_seq, true /* immutable_memtable */);
+      if (range_del_iter == nullptr || range_del_iter->empty()) {
+        delete range_del_iter;
+      } else {
+        mem_tombstone_iter = new TruncatedRangeDelIterator(
+            std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+            &m->GetInternalKeyComparator(), nullptr /* smallest */,
+            nullptr /* largest */);
+      }
+      merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
+                                                       mem_tombstone_iter);
+    }
+  }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->num_entries();
+  }
+  return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+    const Slice& start_ikey, const Slice& end_ikey) {
+  MemTable::MemTableStats total_stats = {0, 0};
+  for (auto& m : memlist_) {
+    auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+    total_stats.size += mStats.size;
+    total_stats.count += mStats.count;
+  }
+  return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->num_deletes();
+  }
+  return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+    bool include_history) const {
+  if (include_history && !memlist_history_.empty()) {
+    return memlist_history_.back()->GetEarliestSequenceNumber();
+  } else if (!memlist_.empty()) {
+    return memlist_.back()->GetEarliestSequenceNumber();
+  } else {
+    return kMaxSequenceNumber;
+  }
+}
+
+SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const {
+  SequenceNumber min_first_seqno = kMaxSequenceNumber;
+  // The first memtable in the list might not be the oldest one with mempurge
+  for (const auto& m : memlist_) {
+    min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno);
+  }
+  return min_first_seqno;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  AddMemTable(m);
+  // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast
+  TrimHistory(to_delete, 0);
+}
+
+// Removes m from list of memtables not flushed. Caller should NOT Unref m.
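+// Editorial note (not part of the upstream change): depending on
+// max_write_buffer_size_to_maintain_ / max_write_buffer_number_to_maintain_,
+// the removed memtable is either parked in memlist_history_ (kept around for
+// transaction write-conflict checking) or unreffed immediately, as the body
+// below shows.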
+void MemTableListVersion::Remove(MemTable* m,
+                                 autovector<MemTable*>* to_delete) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.remove(m);
+
+  m->MarkFlushed();
+  if (max_write_buffer_size_to_maintain_ > 0 ||
+      max_write_buffer_number_to_maintain_ > 0) {
+    memlist_history_.push_front(m);
+    // Unable to get size of mutable memtable at this point, pass 0 to
+    // TrimHistory as a best effort.
+    TrimHistory(to_delete, 0);
+  } else {
+    UnrefMemTable(to_delete, m);
+  }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is
+// dropped
+size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const {
+  size_t total_memtable_size = 0;
+  for (auto& memtable : memlist_) {
+    total_memtable_size += memtable->MemoryAllocatedBytes();
+  }
+  for (auto& memtable : memlist_history_) {
+    total_memtable_size += memtable->MemoryAllocatedBytes();
+  }
+  if (!memlist_history_.empty()) {
+    total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes();
+  }
+  return total_memtable_size;
+}
+
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+  if (max_write_buffer_size_to_maintain_ > 0) {
+    // calculate the total memory usage after dropping the oldest flushed
+    // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+    // whether to trim history
+    return MemoryAllocatedBytesExcludingLast() + usage >=
+           static_cast<size_t>(max_write_buffer_size_to_maintain_);
+  } else if (max_write_buffer_number_to_maintain_ > 0) {
+    return memlist_.size() + memlist_history_.size() >
+           static_cast<size_t>(max_write_buffer_number_to_maintain_);
+  } else {
+    return false;
+  }
+}
+
+// Make sure we don't use up too much space in history
+bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+                                      size_t usage) {
+  bool ret = false;
+  while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+    MemTable* x = memlist_history_.back();
+    memlist_history_.pop_back();
+
+    UnrefMemTable(to_delete, x);
+    ret = true;
+  }
+  return ret;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+  if ((flush_requested_ && num_flush_not_started_ > 0) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+    assert(imm_flush_needed.load(std::memory_order_relaxed));
+    return true;
+  }
+  return false;
+}
+
+bool MemTableList::IsFlushPendingOrRunning() const {
+  if (current_->memlist_.size() - num_flush_not_started_ > 0) {
+    // Flush is already running on at least one memtable
+    return true;
+  }
+  return IsFlushPending();
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
+                                        autovector<MemTable*>* ret,
+                                        uint64_t* max_next_log_number) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+  const auto& memlist = current_->memlist_;
+  bool atomic_flush = false;
+
+  // Note: every time MemTableList::Add(mem) is called, it adds the new mem
+  // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by
+  // iterating through the memlist starting at the end, the vector<MemTable*>
+  // ret is filled with memtables already sorted in increasing MemTable ID.
+  // However, when the mempurge feature is activated, new memtables with older
+  // IDs will be added to the memlist.
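+  // Editorial example (not part of the upstream change): because mempurge can
+  // insert a fresh memtable whose ID is smaller than an already-listed one,
+  // back-to-front iteration may visit IDs like 1, 2, 4, 3; callers therefore
+  // must not assume strictly increasing IDs when mempurge is enabled.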
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+      atomic_flush = true;
+    }
+    if (m->GetID() > max_memtable_id) {
+      break;
+    }
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.store(false, std::memory_order_release);
+      }
+      m->flush_in_progress_ = true;  // flushing will start very soon
+      if (max_next_log_number) {
+        *max_next_log_number =
+            std::max(m->GetNextLogNumber(), *max_next_log_number);
+      }
+      ret->push_back(m);
+    } else if (!ret->empty()) {
+      // This `break` is necessary to prevent picking non-consecutive memtables
+      // in case `memlist` has one or more entries with
+      // `flush_in_progress_ == true` sandwiched between entries with
+      // `flush_in_progress_ == false`. This could happen after parallel
+      // flushes are picked and the one flushing older memtables is rolled
+      // back.
+      break;
+    }
+  }
+  if (!atomic_flush || num_flush_not_started_ == 0) {
+    flush_requested_ = false;  // start-flush request is complete
+  }
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                                         uint64_t /*file_number*/) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+  assert(!mems.empty());
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a succeeding attempt to flush will be successful.
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just
+// return Status::OK letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+    VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer,
+    std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+    bool write_edits) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+  mu->AssertHeld();
+
+  // Flush was successful
+  // Record the status on the memtable object. Either this call or a call by a
+  // concurrent flush thread will read the status and write it to manifest.
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already committing, then return
+  Status s;
+  if (commit_in_progress_) {
+    TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // Retry until all completed flushes are committed. New flushes can finish
+  // while the current thread is writing manifest where mutex is released.
+  while (s.ok()) {
+    auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set to it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes
+    // must be recorded in manifest in order.
+    // A concurrent flush thread, which is assigned to flush the oldest
+    // memtable, will later wake up and do all the pending writes to manifest,
+    // in order.
+    if (memlist.empty() || !memlist.back()->flush_completed_) {
+      break;
+    }
+    // scan all memtables from the earliest, and commit those
+    // (in that order) that have finished flushing. Memtables
+    // are always committed in the order that they were created.
+    uint64_t batch_file_number = 0;
+    size_t batch_count = 0;
+    autovector<VersionEdit*> edit_list;
+    autovector<MemTable*> memtables_to_flush;
+    // enumerate from the last (earliest) element to see how many batches
+    // finished
+    for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+      MemTable* m = *it;
+      if (!m->flush_completed_) {
+        break;
+      }
+      if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+        batch_file_number = m->file_number_;
+        if (m->edit_.GetBlobFileAdditions().empty()) {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64 " started",
+                           cfd->GetName().c_str(), m->file_number_);
+        } else {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           " (+%zu blob files) started",
+                           cfd->GetName().c_str(), m->file_number_,
+                           m->edit_.GetBlobFileAdditions().size());
+        }
+
+        edit_list.push_back(&m->edit_);
+        memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+        std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+        if (info != nullptr) {
+          committed_flush_jobs_info->push_back(std::move(info));
+        }
+#else
+        (void)committed_flush_jobs_info;
+#endif  // !ROCKSDB_LITE
+      }
+      batch_count++;
+    }
+
+    // TODO(myabandeh): Not sure how batch_count could be 0 here.
+    if (batch_count > 0) {
+      uint64_t min_wal_number_to_keep = 0;
+      assert(edit_list.size() > 0);
+      if (vset->db_options()->allow_2pc) {
+        // Note that if mempurge is successful, the edit_list will
+        // not be applicable (contains info of new min_log number to keep,
+        // and level 0 file path of SST file created during normal flush,
+        // so both pieces of information are irrelevant after a successful
+        // mempurge operation).
+        min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+            vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
+
+        // We piggyback the information of earliest log file to keep in the
+        // manifest entry for the last file flushed.
+      } else {
+        min_wal_number_to_keep =
+            PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
+      }
+
+      VersionEdit wal_deletion;
+      wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+      if (vset->db_options()->track_and_verify_wals_in_manifest) {
+        if (min_wal_number_to_keep >
+            vset->GetWalSet().GetMinWalNumberToKeep()) {
+          wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+        }
+        TEST_SYNC_POINT_CALLBACK(
+            "MemTableList::TryInstallMemtableFlushResults:"
+            "AfterComputeMinWalToKeep",
+            nullptr);
+      }
+      edit_list.push_back(&wal_deletion);
+
+      const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
+                                      to_delete, mu](const Status& status) {
+        RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer,
+                                      to_delete, mu);
+      };
+      if (write_edits) {
+        // this can release and reacquire the mutex.
+        s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+                              db_directory, /*new_descriptor_log=*/false,
+                              /*column_family_options=*/nullptr,
+                              manifest_write_cb);
+      } else {
+        // If write_edits is false (e.g. on a successful mempurge), then
+        // remove the old memtables, wake up the manifest write queue threads,
+        // and don't commit anything to the manifest file.
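+        // Editorial note (not part of the upstream change): in this branch
+        // the batch's memtables are removed from the immutable list, but no
+        // VersionEdit reaches the MANIFEST.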
+        RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
+                                      to_delete, mu);
+        // Note: cfd->SetLogNumber is only called when a VersionEdit
+        // is written to MANIFEST. When mempurge is successful, we skip
+        // this step, therefore cfd->GetLogNumber is always the
+        // earliest log with data unflushed.
+        // Notify new head of manifest write queue.
+        // wake up all the waiting writers
+        // TODO(bjlemaire): explain full reason WakeUpWaitingManifestWriters
+        // needed or investigate more.
+        vset->WakeUpWaitingManifestWriters();
+      }
+    }
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(static_cast<int>(current_->memlist_.size()) >=
+         num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the immutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m, to_delete);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.store(true, std::memory_order_release);
+  }
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+}
+
+bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+  InstallNewVersion();
+  bool ret = current_->TrimHistory(to_delete, usage);
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+  return ret;
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+  size_t total_size = 0;
+  for (auto& memtable : current_->memlist_) {
+    total_size += memtable->ApproximateMemoryUsage();
+  }
+  return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
+  const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
+      std::memory_order_relaxed);
+  return usage;
+}
+
+bool MemTableList::HasHistory() const {
+  const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+  return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+  const size_t total_memtable_size =
+      current_->MemoryAllocatedBytesExcludingLast();
+  current_memory_allocted_bytes_excluding_last_.store(
+      total_memtable_size, std::memory_order_relaxed);
+
+  const bool has_history = current_->HasHistory();
+  current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+  if (!current_->memlist_.empty()) {
+    return current_->memlist_.back()->ApproximateOldestKeyTime();
+  }
+  return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(&current_memory_usage_, *version);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+void MemTableList::RemoveMemTablesOrRestoreFlags(
+    const Status& s, ColumnFamilyData* cfd, size_t batch_count,
+    LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
+    InstrumentedMutex* mu) {
+  assert(mu);
+  mu->AssertHeld();
+  assert(to_delete);
changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + +uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( + const std::unordered_set* memtables_to_flush) { + uint64_t min_log = 0; + + for (auto& m : current_->memlist_) { + if (memtables_to_flush && memtables_to_flush->count(m)) { + continue; + } + + auto log = m->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +// Commit a successful atomic flush in the manifest file. 
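+//
+// A sketch of the atomic-group tagging performed below, assuming three
+// column families a, b, c (names and values are illustrative, not part of
+// the function's contract): each CF contributes one VersionEdit, and the
+// last edit list also carries the WAL-deletion edit, so the four edits are
+// tagged with the number of group members still to come:
+//
+//   edit_a.MarkAtomicGroup(3);
+//   edit_b.MarkAtomicGroup(2);
+//   edit_c.MarkAtomicGroup(1);
+//   wal_deletion.MarkAtomicGroup(0);  // group is complete
+//
+// so recovery applies none of them unless the whole group reached the
+// MANIFEST.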
+Status InstallMemtableAtomicFlushResults(
+    const autovector<MemTableList*>* imm_lists,
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<const autovector<MemTable*>*>& mems_list,
+    VersionSet* vset, LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_metas,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+  mu->AssertHeld();
+
+  size_t num = mems_list.size();
+  assert(cfds.size() == num);
+  if (imm_lists != nullptr) {
+    assert(imm_lists->size() == num);
+  }
+  if (num == 0) {
+    return Status::OK();
+  }
+
+  for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+    const auto* imm =
+        (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+    if (!mems_list[k]->empty()) {
+      assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+    }
+#endif
+    assert(nullptr != file_metas[k]);
+    for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+      assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+      (*mems_list[k])[i]->SetFlushCompleted(true);
+      (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+    }
+#ifndef ROCKSDB_LITE
+    if (committed_flush_jobs_info[k]) {
+      assert(!mems_list[k]->empty());
+      assert((*mems_list[k])[0]);
+      std::unique_ptr<FlushJobInfo> flush_job_info =
+          (*mems_list[k])[0]->ReleaseFlushJobInfo();
+      committed_flush_jobs_info[k]->push_back(std::move(flush_job_info));
+    }
+#else   //! ROCKSDB_LITE
+    (void)committed_flush_jobs_info;
+#endif  // ROCKSDB_LITE
+  }
+
+  Status s;
+
+  autovector<autovector<VersionEdit*>> edit_lists;
+  uint32_t num_entries = 0;
+  for (const auto mems : mems_list) {
+    assert(mems != nullptr);
+    autovector<VersionEdit*> edits;
+    assert(!mems->empty());
+    edits.emplace_back((*mems)[0]->GetEdits());
+    ++num_entries;
+    edit_lists.emplace_back(edits);
+  }
+
+  WalNumber min_wal_number_to_keep = 0;
+  if (vset->db_options()->allow_2pc) {
+    min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+        vset, cfds, edit_lists, mems_list, prep_tracker);
+  } else {
+    min_wal_number_to_keep =
+        PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
+  }
+
+  VersionEdit wal_deletion;
+  wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+  if (vset->db_options()->track_and_verify_wals_in_manifest &&
+      min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
+    wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+  }
+  edit_lists.back().push_back(&wal_deletion);
+  ++num_entries;
+
+  // Mark the version edits as an atomic group if the number of version edits
+  // exceeds 1.
+  if (cfds.size() > 1) {
+    for (size_t i = 0; i < edit_lists.size(); i++) {
+      assert((edit_lists[i].size() == 1) ||
+             ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1)));
+      for (auto& e : edit_lists[i]) {
+        e->MarkAtomicGroup(--num_entries);
+      }
+    }
+    assert(0 == num_entries);
+  }
+
+  // this can release and reacquire the mutex.
+  s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                        db_directory);
+
+  for (size_t k = 0; k != cfds.size(); ++k) {
+    auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+    imm->InstallNewVersion();
+  }
+
+  if (s.ok() || s.IsColumnFamilyDropped()) {
+    for (size_t i = 0; i != cfds.size(); ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      auto* imm = (imm_lists == nullptr) ?
cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + assert(m->GetFileNumber() > 0); + uint64_t mem_id = m->GetID(); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + + imm->current_->Remove(m, to_delete); + imm->UpdateCachedValuesFromMemTableListVersion(); + imm->ResetTrimHistoryNeeded(); + } + } + } else { + for (size_t i = 0; i != cfds.size(); ++i) { + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + uint64_t mem_id = m->GetID(); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->GetEdits()->Clear(); + m->SetFileNumber(0); + imm->num_flush_not_started_++; + } + imm->imm_flush_needed.store(true, std::memory_order_release); + } + } + + return s; +} + +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + autovector old_memtables; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + old_memtables.push_back(mem); + } + + for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) { + MemTable* mem = *it; + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } + + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h new file mode 100644 index 000000000..1ad28a59e --- /dev/null +++ b/src/rocksdb/db/memtable_list.h @@ -0,0 +1,471 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// keeps a list of immutable memtables in a vector. the list is immutable
+// if refcount is bigger than one. It is used as a state for Get() and
+// Iterator code paths
+//
+// This class is not thread-safe. External synchronization is required
+// (such as holding the db mutex or being on the write thread).
+class MemTableListVersion {
+ public:
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               const MemTableListVersion& old);
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               int max_write_buffer_number_to_maintain,
+                               int64_t max_write_buffer_size_to_maintain);
+
+  void Ref();
+  void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  //
+  // If any operation was found for this key, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+           bool* is_blob_index = nullptr);
+
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+           bool* is_blob_index = nullptr) {
+    SequenceNumber seq;
+    return Get(key, value, columns, timestamp, s, merge_context,
+               max_covering_tombstone_seq, &seq, read_opts, callback,
+               is_blob_index);
+  }
+
+  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                ReadCallback* callback);
+
+  // Returns all the merge operands corresponding to the key by searching all
+  // memtables starting from the most recent one.
+  bool GetMergeOperands(const LookupKey& key, Status* s,
+                        MergeContext* merge_context,
+                        SequenceNumber* max_covering_tombstone_seq,
+                        const ReadOptions& read_opts);
+
+  // Similar to Get(), but searches the Memtable history of memtables that
+  // have already been flushed. Should only be used from in-memory only
+  // queries (such as Transaction validation) as the history may contain
+  // writes that are also present in the SST files.
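+  //
+  // A sketch of the intended use (illustrative names, not an API contract):
+  // a transaction validating that "key" was not written after its snapshot
+  // can probe the history even if the covering memtable was already flushed:
+  //
+  //   bool found = version->GetFromHistory(
+  //       LookupKey("key", snapshot_seq), &value, /*columns=*/nullptr,
+  //       /*timestamp=*/nullptr, &s, &merge_context,
+  //       &max_covering_tombstone_seq, ReadOptions());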
+  bool GetFromHistory(const LookupKey& key, std::string* value,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
+                      SequenceNumber* max_covering_tombstone_seq,
+                      SequenceNumber* seq, const ReadOptions& read_opts,
+                      bool* is_blob_index = nullptr);
+  bool GetFromHistory(const LookupKey& key, std::string* value,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
+                      SequenceNumber* max_covering_tombstone_seq,
+                      const ReadOptions& read_opts,
+                      bool* is_blob_index = nullptr) {
+    SequenceNumber seq;
+    return GetFromHistory(key, value, columns, timestamp, s, merge_context,
+                          max_covering_tombstone_seq, &seq, read_opts,
+                          is_blob_index);
+  }
+
+  Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+                                    RangeDelAggregator* range_del_agg);
+
+  void AddIterators(const ReadOptions& options,
+                    std::vector<InternalIterator*>* iterator_list,
+                    Arena* arena);
+
+  void AddIterators(const ReadOptions& options,
+                    MergeIteratorBuilder* merge_iter_builder,
+                    bool add_range_tombstone_iter);
+
+  uint64_t GetTotalNumEntries() const;
+
+  uint64_t GetTotalNumDeletes() const;
+
+  MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+                                           const Slice& end_ikey);
+
+  // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+  // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+  // Return the first sequence number from the memtable list, which is the
+  // smallest sequence number of all FirstSequenceNumber.
+  // Return kMaxSequenceNumber if the list is empty.
+  SequenceNumber GetFirstSequenceNumber() const;
+
+ private:
+  friend class MemTableList;
+
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer);
+
+  // REQUIRE: m is an immutable memtable
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+  // REQUIRE: m is an immutable memtable
+  void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+  // Return true if memtable is trimmed
+  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+                   std::string* value, PinnableWideColumns* columns,
+                   std::string* timestamp, Status* s,
+                   MergeContext* merge_context,
+                   SequenceNumber* max_covering_tombstone_seq,
+                   SequenceNumber* seq, const ReadOptions& read_opts,
+                   ReadCallback* callback = nullptr,
+                   bool* is_blob_index = nullptr);
+
+  void AddMemTable(MemTable* m);
+
+  void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+  // Calculate the total amount of memory used by memlist_ and
+  // memlist_history_, excluding the last MemTable in memlist_history_. The
+  // reason for excluding the last MemTable is to see if dropping the last
+  // MemTable will keep total memory usage above or equal to
+  // max_write_buffer_size_to_maintain_
+  size_t MemoryAllocatedBytesExcludingLast() const;
+
+  // Whether this version contains flushed memtables that are only kept around
+  // for transaction conflict checking.
+  bool HasHistory() const { return !memlist_history_.empty(); }
+
+  bool MemtableLimitExceeded(size_t usage);
+
+  // Immutable MemTables that have not yet been flushed.
+  std::list<MemTable*> memlist_;
+
+  // MemTables that have already been flushed
+  // (used during Transaction validation)
+  std::list<MemTable*> memlist_history_;
+
+  // Maximum number of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int max_write_buffer_number_to_maintain_;
+  // Maximum size of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int64_t max_write_buffer_size_to_maintain_;
+
+  int refs_ = 0;
+
+  size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge,
+                        int max_write_buffer_number_to_maintain,
+                        int64_t max_write_buffer_size_to_maintain)
+      : imm_flush_needed(false),
+        imm_trim_needed(false),
+        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion(&current_memory_usage_,
+                                         max_write_buffer_number_to_maintain,
+                                         max_write_buffer_size_to_maintain)),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false),
+        current_memory_usage_(0),
+        current_memory_allocted_bytes_excluding_last_(0),
+        current_has_history_(false) {
+    current_->Ref();
+  }
+
+  // Should not delete MemTableList without making sure MemTableList::current()
+  // is Unref()'d.
+  ~MemTableList() {}
+
+  MemTableListVersion* current() const { return current_; }
+
+  // so that background threads can detect non-nullptr pointer to
+  // determine whether there is anything more to start flushing.
+  std::atomic<bool> imm_flush_needed;
+
+  std::atomic<bool> imm_trim_needed;
+
+  // Returns the total number of memtables in the list that haven't yet
+  // been flushed and logged.
+  int NumNotFlushed() const;
+
+  // Returns total number of memtables in the list that have been
+  // completely flushed and logged.
+  int NumFlushed() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns true if there is at least one memtable that is pending flush or
+  // flushing.
+  bool IsFlushPendingOrRunning() const;
+
+  // Returns the earliest memtables that need to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(uint64_t max_memtable_id,
+                            autovector<MemTable*>* mems,
+                            uint64_t* max_next_log_number = nullptr);
+
+  // Reset status of the given memtable list back to pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number);
+
+  // Try to commit a successful flush in the manifest file. It might just
+  // return Status::OK, letting a concurrent flush do the actual recording.
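+  //
+  // A minimal sketch of the expected call sequence (assuming a single flush
+  // job; WriteLevel0Table is a hypothetical helper, not part of this class):
+  //
+  //   autovector<MemTable*> mems;
+  //   imm->PickMemtablesToFlush(max_memtable_id, &mems);
+  //   Status s = WriteLevel0Table(...);
+  //   if (s.ok()) {
+  //     s = imm->TryInstallMemtableFlushResults(
+  //         cfd, mutable_cf_options, mems, prep_tracker, vset, mu,
+  //         file_number, &to_delete, db_directory, log_buffer,
+  //         &flush_jobs_info);
+  //   } else {
+  //     imm->RollbackMemtableFlush(mems, file_number);
+  //   }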
+  Status TryInstallMemtableFlushResults(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer,
+      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+      bool write_edits = true);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the reference held on *m by the caller of Add().
+  // By default, adding memtables will flag that the memtable list needs to be
+  // flushed, but in certain situations, like after a mempurge, we may want to
+  // avoid flushing the memtable list upon addition of a memtable.
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+  size_t MemoryAllocatedBytesExcludingLast() const;
+
+  // Returns the cached current_has_history_ value.
+  bool HasHistory() const;
+
+  // Updates current_memory_allocted_bytes_excluding_last_ and
+  // current_has_history_ from MemTableListVersion. Must be called whenever
+  // InstallNewVersion is called.
+  void UpdateCachedValuesFromMemTableListVersion();
+
+  // `usage` is the current size of the mutable Memtable. When
+  // max_write_buffer_size_to_maintain is used, the total size of the mutable
+  // and immutable memtables is checked against it to decide whether to trim
+  // the memtable list.
+  //
+  // Return true if memtable is trimmed
+  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  // Returns an estimate of the number of bytes of data used by
+  // the unflushed mem-tables.
+  size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+  // Returns an estimate of the timestamp of the earliest key.
+  uint64_t ApproximateOldestKeyTime() const;
+
+  // Request a flush of all existing memtables to storage. This will
+  // cause future calls to IsFlushPending() to return true if this list is
+  // non-empty (regardless of the min_write_buffer_number_to_merge
+  // parameter). This flush request will persist until the next time
+  // PickMemtablesToFlush() is called.
+  void FlushRequested() {
+    flush_requested_ = true;
+    // If there are some memtables stored in imm() that don't trigger
+    // flush (e.g., a mempurge output memtable), then update imm_flush_needed.
+    // Note: if, due to a race condition, imm_flush_needed is set to true
+    // while num_flush_not_started_ == 0, there is no impact whatsoever.
+    // imm_flush_needed is only used in an assert in IsFlushPending().
+    if (num_flush_not_started_ > 0) {
+      imm_flush_needed.store(true, std::memory_order_release);
+    }
+  }
+
+  bool HasFlushRequested() { return flush_requested_; }
+
+  // Returns true if a trim history should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkTrimHistoryNeeded() {
+    auto expected = false;
+    return imm_trim_needed.compare_exchange_strong(
+        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  void ResetTrimHistoryNeeded() {
+    auto expected = true;
+    imm_trim_needed.compare_exchange_strong(
+        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  // No copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+  size_t* current_memory_usage() { return &current_memory_usage_; }
+
+  // Returns the min log containing the prep section after memtables listed in
+  // `memtables_to_flush` are flushed and their status is persisted in the
+  // manifest.
+  uint64_t PrecomputeMinLogContainingPrepSection(
+      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
+
+  uint64_t GetEarliestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return memlist.back()->GetID();
+  }
+
+  uint64_t GetLatestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return 0;
+    }
+    return memlist.front()->GetID();
+  }
+
+  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+    const auto& memlist = current_->memlist_;
+    // Scan the memtable list from new to old
+    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+      MemTable* mem = *it;
+      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+        mem->atomic_flush_seqno_ = seq;
+      } else {
+        // Earlier memtables must have been assigned an atomic flush seq; no
+        // need to continue the scan.
+        break;
+      }
+    }
+  }
+
+  // Used only by DBImplSecondary during log replay.
+  // Remove memtables whose data were written before the WAL with log_number
+  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+  // not freed, but put into a vector for future deref and reclamation.
+  void RemoveOldMemTables(uint64_t log_number,
+                          autovector<MemTable*>* to_delete);
+
+ private:
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer);
+
+  // DB mutex held
+  void InstallNewVersion();
+
+  // DB mutex held
+  // Called after writing to MANIFEST
+  void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
+                                     size_t batch_count, LogBuffer* log_buffer,
+                                     autovector<MemTable*>* to_delete,
+                                     InstrumentedMutex* mu);
+
+  const int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of memtables to storage. It's possible to request that
+  // a subset of memtables be flushed.
+  bool flush_requested_;
+
+  // The current memory usage.
+  size_t current_memory_usage_;
+
+  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
+
+  // Cached value of current_->HasHistory().
+  std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. There are unit tests
+// that install flush results for external immutable memtable lists other
+// than the cfds' own immutable memtable lists, e.g. MemTableListTest. In
+// this case, the imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+    const autovector<MemTableList*>* imm_lists,
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<const autovector<MemTable*>*>& mems_list,
+    VersionSet* vset, LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_meta,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..8242061af
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,1039 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+  std::string dbname;
+  DB* db;
+  Options options;
+  std::vector<ColumnFamilyHandle*> handles;
+  std::atomic<uint64_t> file_number;
+
+  MemTableListTest() : db(nullptr), file_number(1) {
+    dbname = test::PerThreadDBPath("memtable_list_test");
+    options.create_if_missing = true;
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  // Create a test db if not yet created
+  void CreateDB() {
+    if (db == nullptr) {
+      options.create_if_missing = true;
+      EXPECT_OK(DestroyDB(dbname, options));
+      // Open DB only with default column family
+      ColumnFamilyOptions cf_options;
+      std::vector<ColumnFamilyDescriptor> cf_descs;
+      cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+      Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+      EXPECT_OK(s);
+
+      ColumnFamilyOptions cf_opt1, cf_opt2;
+      cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+                                    std::numeric_limits<uint64_t>::max());
+      cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+                                    std::numeric_limits<uint64_t>::max());
+      int sz = static_cast<int>(handles.size());
+      handles.resize(sz + 2);
+      s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+      EXPECT_OK(s);
+      s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+      EXPECT_OK(s);
+
+      cf_descs.emplace_back("one", cf_options);
+      cf_descs.emplace_back("two", cf_options);
+    }
+  }
+
+  ~MemTableListTest() override {
+    if (db) {
+      std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+#ifndef ROCKSDB_LITE
+      for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+        EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i]));
+      }
+#endif  // !ROCKSDB_LITE
+      for (auto h : handles) {
+        if (h) {
+          EXPECT_OK(db->DestroyColumnFamilyHandle(h));
+        }
+      }
+      handles.clear();
+      delete db;
+      db = nullptr;
EXPECT_OK(DestroyDB(dbname, options, cf_descs)); + } + } + + // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all + // structures needed to call this function. + Status Mock_InstallMemtableFlushResults( + MemTableList* list, const MutableCFOptions& mutable_cf_options, + const autovector& m, autovector* to_delete) { + // Create a mock Logger + test::NullLogger logger; + LogBuffer log_buffer(DEBUG_LEVEL, &logger); + + CreateDB(); + // Create a mock VersionSet + DBOptions db_options; + ImmutableDBOptions immutable_db_options(db_options); + EnvOptions env_options; + std::shared_ptr table_cache(NewLRUCache(50000, 16)); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + WriteController write_controller(10000000u); + + VersionSet versions(dbname, &immutable_db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_id*/ "", + /*db_session_id*/ ""); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + + EXPECT_OK(versions.Recover(cf_descs, false)); + + // Create mock default ColumnFamilyData + auto column_family_set = versions.GetColumnFamilySet(); + LogsWithPrepTracker dummy_prep_tracker; + auto cfd = column_family_set->GetDefault(); + EXPECT_TRUE(nullptr != cfd); + uint64_t file_num = file_number.fetch_add(1); + IOStatus io_s; + // Create dummy mutex. + InstrumentedMutex mutex; + InstrumentedMutexLock l(&mutex); + std::list> flush_jobs_info; + Status s = list->TryInstallMemtableFlushResults( + cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, + file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info); + EXPECT_OK(io_s); + return s; + } + + // Calls MemTableList::InstallMemtableFlushResults() and sets up all + // structures needed to call this function. 
+ Status Mock_InstallMemtableAtomicFlushResults( + autovector& lists, const autovector& cf_ids, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + autovector* to_delete) { + // Create a mock Logger + test::NullLogger logger; + LogBuffer log_buffer(DEBUG_LEVEL, &logger); + + CreateDB(); + // Create a mock VersionSet + DBOptions db_options; + + ImmutableDBOptions immutable_db_options(db_options); + EnvOptions env_options; + std::shared_ptr table_cache(NewLRUCache(50000, 16)); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + WriteController write_controller(10000000u); + + VersionSet versions(dbname, &immutable_db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_id*/ "", + /*db_session_id*/ ""); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + EXPECT_OK(versions.Recover(cf_descs, false)); + + // Create mock default ColumnFamilyData + + auto column_family_set = versions.GetColumnFamilySet(); + + LogsWithPrepTracker dummy_prep_tracker; + autovector cfds; + for (int i = 0; i != static_cast(cf_ids.size()); ++i) { + cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i])); + EXPECT_NE(nullptr, cfds[i]); + } + std::vector file_metas; + file_metas.reserve(cf_ids.size()); + for (size_t i = 0; i != cf_ids.size(); ++i) { + FileMetaData meta; + uint64_t file_num = file_number.fetch_add(1); + meta.fd = FileDescriptor(file_num, 0, 0); + file_metas.emplace_back(meta); + } + autovector file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } + std::vector>> + committed_flush_jobs_info_storage(cf_ids.size()); + autovector>*> + committed_flush_jobs_info; + for (int i = 0; i < static_cast(cf_ids.size()); ++i) { + committed_flush_jobs_info.push_back( + &committed_flush_jobs_info_storage[i]); + } + + InstrumentedMutex mutex; + InstrumentedMutexLock l(&mutex); + return InstallMemtableAtomicFlushResults( + &lists, cfds, mutable_cf_options_list, mems_list, &versions, + nullptr /* prep_tracker */, &mutex, file_meta_ptrs, + committed_flush_jobs_info, to_delete, nullptr, &log_buffer); + } +}; + +TEST_F(MemTableListTest, Empty) { + // Create an empty MemTableList and validate basic functions. 
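+  // (The constructor arguments are, in order, min_write_buffer_number_to_merge,
+  // max_write_buffer_number_to_maintain and max_write_buffer_size_to_maintain;
+  // 1/0/0 means no batching of flushes and no flushed-memtable history.)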
+ MemTableList list(1, 0, 0); + + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + + autovector mems; + list.PickMemtablesToFlush( + std::numeric_limits::max() /* memtable_id */, &mems); + ASSERT_EQ(0, mems.size()); + + autovector to_delete; + list.current()->Unref(&to_delete); + ASSERT_EQ(0, to_delete.size()); +} + +TEST_F(MemTableListTest, GetTest) { + // Create MemTableList + int min_write_buffer_number_to_merge = 2; + int max_write_buffer_number_to_maintain = 0; + int64_t max_write_buffer_size_to_maintain = 0; + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + SequenceNumber seq = 1; + std::string value; + Status s; + MergeContext merge_context; + InternalKeyComparator ikey_cmp(options.comparator); + SequenceNumber max_covering_tombstone_seq = 0; + autovector to_delete; + + LookupKey lkey("key1", seq); + bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableOptions ioptions(options); + + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->Ref(); + + // Write some keys to this memtable. + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); + + // Fetch the newly written keys + merge_context.Clear(); + found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value1"); + + merge_context.Clear(); + found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + // MemTable found out that this key is *not* found (at this sequence#) + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.2"); + + ASSERT_EQ(4, mem->num_entries()); + ASSERT_EQ(1, mem->num_deletes()); + + // Add memtable to list + // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) + // in MemTableListVersion::GetFromList work. 
+ mem->ConstructFragmentedRangeTombstones(); + list.Add(mem, &to_delete); + + SequenceNumber saved_seq = seq; + + // Create another memtable and write some keys to it + WriteBufferManager wb2(options.db_write_buffer_size); + MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2, + kMaxSequenceNumber, 0 /* column_family_id */); + mem2->Ref(); + + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", + nullptr /* kv_prot_info */)); + + // Add second memtable to list + // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) + // in MemTableListVersion::GetFromList work. + mem2->ConstructFragmentedRangeTombstones(); + list.Add(mem2, &to_delete); + + // Fetch keys via MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->Get(LookupKey("key1", saved_seq), &value, + /*columns=*/nullptr, /*timestamp=*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ("value1", value); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.3"); + + merge_context.Clear(); + found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + ASSERT_EQ(2, list.NumNotFlushed()); + + list.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } +} + +TEST_F(MemTableListTest, GetFromHistoryTest) { + // Create MemTableList + int min_write_buffer_number_to_merge = 2; + int max_write_buffer_number_to_maintain = 2; + int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + SequenceNumber seq = 1; + std::string value; + Status s; + MergeContext merge_context; + InternalKeyComparator ikey_cmp(options.comparator); + SequenceNumber max_covering_tombstone_seq = 0; + autovector to_delete; + + LookupKey lkey("key1", seq); + bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableOptions ioptions(options); + + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->Ref(); + + // Write some keys to this memtable. 
+ ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); + + // Fetch the newly written keys + merge_context.Clear(); + found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + // MemTable found out that this key is *not* found (at this sequence#) + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.2"); + + // Add memtable to list + // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) + // in MemTableListVersion::GetFromList work. + mem->ConstructFragmentedRangeTombstones(); + list.Add(mem, &to_delete); + ASSERT_EQ(0, to_delete.size()); + + // Fetch keys via MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ("value2.2", value); + + // Flush this memtable from the list. + // (It will then be a part of the memtable history). 
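+  // (How long it stays in the history is bounded by
+  // max_write_buffer_size_to_maintain, set above to 2 * Arena::kInlineSize:
+  // roughly enough for two small memtables before TrimHistory() starts
+  // dropping the oldest entries. This is an illustrative reading of the
+  // option, not an exact guarantee.)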
+ autovector to_flush; + list.PickMemtablesToFlush( + std::numeric_limits::max() /* memtable_id */, &to_flush); + ASSERT_EQ(1, to_flush.size()); + + MutableCFOptions mutable_cf_options(options); + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); + ASSERT_OK(s); + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_EQ(1, list.NumFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Verify keys are no longer in MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Verify keys are present in history + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(found); + ASSERT_EQ("value2.2", value); + + // Create another memtable and write some keys to it + WriteBufferManager wb2(options.db_write_buffer_size); + MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2, + kMaxSequenceNumber, 0 /* column_family_id */); + mem2->Ref(); + + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3", + nullptr /* kv_prot_info */)); + + // Add second memtable to list + // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) + // in MemTableListVersion::GetFromList work. + mem2->ConstructFragmentedRangeTombstones(); + list.Add(mem2, &to_delete); + ASSERT_EQ(0, to_delete.size()); + + to_flush.clear(); + list.PickMemtablesToFlush( + std::numeric_limits::max() /* memtable_id */, &to_flush); + ASSERT_EQ(1, to_flush.size()); + + // Flush second memtable + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); + ASSERT_OK(s); + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_EQ(2, list.NumFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Add a third memtable to push the first memtable out of the history + WriteBufferManager wb3(options.db_write_buffer_size); + MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3, + kMaxSequenceNumber, 0 /* column_family_id */); + mem3->Ref(); + // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) + // in MemTableListVersion::GetFromList work. 
+ mem3->ConstructFragmentedRangeTombstones(); + list.Add(mem3, &to_delete); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_EQ(1, list.NumFlushed()); + ASSERT_EQ(1, to_delete.size()); + + // Verify keys are no longer in MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Verify that the second memtable's keys are in the history + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key3", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(found); + ASSERT_EQ("value3", value); + + // Verify that key2 from the first memtable is no longer in the history + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Cleanup + list.current()->Unref(&to_delete); + ASSERT_EQ(3, to_delete.size()); + for (MemTable* m : to_delete) { + delete m; + } +} + +TEST_F(MemTableListTest, FlushPendingTest) { + const int num_tables = 6; + SequenceNumber seq = 1; + Status s; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableOptions ioptions(options); + InternalKeyComparator cmp(BytewiseComparator()); + WriteBufferManager wb(options.db_write_buffer_size); + autovector to_delete; + + // Create MemTableList + int min_write_buffer_number_to_merge = 3; + int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + // Create some MemTables + uint64_t memtable_id = 0; + std::vector tables; + MutableCFOptions mutable_cf_options(options); + for (int i = 0; i < num_tables; i++) { + MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->SetID(memtable_id++); + mem->Ref(); + + std::string value; + MergeContext merge_context; + + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "", + nullptr /* kv_prot_info 
*/));
+
+    tables.push_back(mem);
+  }
+
+  // Nothing to flush
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  autovector<MemTable*> to_flush;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(0, to_flush.size());
+
+  // Request a flush even though there is nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Attempt to 'flush' to clear request for flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(0, to_flush.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Request a flush again
+  list.FlushRequested();
+  // No flush pending since the list is empty.
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add 2 tables
+  list.Add(tables[0], &to_delete);
+  list.Add(tables[1], &to_delete);
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Even though we have less than the minimum to flush, a flush is
+  // pending since we had previously requested a flush and never called
+  // PickMemtablesToFlush() to clear the flush.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(2, to_flush.size());
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Revert flush
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[2], &to_delete);
+  // We now have the minimum to flush regardless of whether FlushRequested()
+  // was called.
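+  // (With min_write_buffer_number_to_merge = 3 in this test, three unflushed
+  // immutable memtables satisfy the minimum by themselves; the earlier checks
+  // needed FlushRequested() because only one or two tables were present.)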
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush2;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+  ASSERT_EQ(0, to_flush2.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add another table
+  list.Add(tables[3], &to_delete);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Request a flush again
+  list.FlushRequested();
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+  ASSERT_EQ(1, to_flush2.size());
+  ASSERT_EQ(4, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Rollback first pick of tables
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[4], &to_delete);
+  ASSERT_EQ(5, list.NumNotFlushed());
+  // We now have the minimum to flush regardless of whether FlushRequested()
+  // was called.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  // Picks the three oldest memtables. The fourth oldest is picked in
+  // `to_flush2`, so it must be excluded. The newest (fifth oldest) is
+  // non-consecutive with the three oldest due to omitting the fourth oldest,
+  // so it must not be picked.
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush3;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
+  // Picks newest (fifth oldest)
+  ASSERT_EQ(1, to_flush3.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Nothing left to flush
+  autovector<MemTable*> to_flush4;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+  ASSERT_EQ(0, to_flush4.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 3 memtables that were picked in to_flush
+  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+                                       &to_delete);
+  ASSERT_OK(s);
+
+  // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
+  // tables[3]. to_flush3 contains tables[4].
+ // Current implementation will only commit memtables in the order they were + // created. So TryInstallMemtableFlushResults will install the first 3 tables + // in to_flush and stop when it encounters a table not yet flushed. + ASSERT_EQ(2, list.NumNotFlushed()); + int num_in_history = + std::min(3, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(num_in_history, list.NumFlushed()); + ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); + + // Request a flush again. Should be nothing to flush + list.FlushRequested(); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Flush the 1 memtable (tables[4]) that was picked in to_flush3 + s = MemTableListTest::Mock_InstallMemtableFlushResults( + &list, mutable_cf_options, to_flush3, &to_delete); + ASSERT_OK(s); + + // This will install 0 tables since tables[4] flushed while tables[3] has not + // yet flushed. + ASSERT_EQ(2, list.NumNotFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Flush the 1 memtable (tables[3]) that was picked in to_flush2 + s = MemTableListTest::Mock_InstallMemtableFlushResults( + &list, mutable_cf_options, to_flush2, &to_delete); + ASSERT_OK(s); + + // This will actually install 2 tables. The 1 we told it to flush, and also + // tables[4] which has been waiting for tables[3] to commit. + ASSERT_EQ(0, list.NumNotFlushed()); + num_in_history = + std::min(5, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(num_in_history, list.NumFlushed()); + ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); + + for (const auto& m : to_delete) { + // Refcount should be 0 after calling TryInstallMemtableFlushResults. + // Verify this, by Ref'ing then UnRef'ing: + m->Ref(); + ASSERT_EQ(m, m->Unref()); + delete m; + } + to_delete.clear(); + + // Add another table + list.Add(tables[5], &to_delete); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_EQ(5, list.GetLatestMemTableID()); + memtable_id = 4; + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 4. Therefore, no table will be selected in this case. + autovector to_flush5; + list.FlushRequested(); + ASSERT_TRUE(list.HasFlushRequested()); + list.PickMemtablesToFlush(memtable_id, &to_flush5); + ASSERT_TRUE(to_flush5.empty()); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.HasFlushRequested()); + + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 5. Therefore, only tables[5] will be selected. + memtable_id = 5; + list.FlushRequested(); + list.PickMemtablesToFlush(memtable_id, &to_flush5); + ASSERT_EQ(1, static_cast(to_flush5.size())); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + to_delete.clear(); + + list.current()->Unref(&to_delete); + int to_delete_size = + std::min(num_tables, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(to_delete_size, to_delete.size()); + + for (const auto& m : to_delete) { + // Refcount should be 0 after calling TryInstallMemtableFlushResults. 
+    // Verify this by Ref'ing and then Unref'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlushTest) {
+  autovector<MemTableList*> lists;
+  autovector<uint32_t> cf_ids;
+  autovector<const MutableCFOptions*> options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  autovector<MemTable*> to_delete;
+  Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+                                                    to_flush, &to_delete);
+  ASSERT_OK(s);
+  ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlushTest) {
+  const int num_cfs = 3;
+  const int num_tables_per_cf = 2;
+  SequenceNumber seq = 1;
+
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+  InternalKeyComparator cmp(BytewiseComparator());
+  WriteBufferManager wb(options.db_write_buffer_size);
+
+  // Create MemTableLists
+  int min_write_buffer_number_to_merge = 3;
+  int max_write_buffer_number_to_maintain = 7;
+  int64_t max_write_buffer_size_to_maintain =
+      7 * static_cast<int64_t>(options.write_buffer_size);
+  autovector<MemTableList*> lists;
+  for (int i = 0; i != num_cfs; ++i) {
+    lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+                                        max_write_buffer_number_to_maintain,
+                                        max_write_buffer_size_to_maintain));
+  }
+
+  autovector<uint32_t> cf_ids;
+  std::vector<std::vector<MemTable*>> tables(num_cfs);
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  uint32_t cf_id = 0;
+  for (auto& elem : tables) {
+    mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+    uint64_t memtable_id = 0;
+    for (int i = 0; i != num_tables_per_cf; ++i) {
+      MemTable* mem =
+          new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+                       kMaxSequenceNumber, cf_id);
+      mem->SetID(memtable_id++);
+      mem->Ref();
+
+      std::string value;
+
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+                         nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i),
+                         "valueN", nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+                         nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i),
+                         "valueM", nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+                         nullptr /* kv_prot_info */));
+
+      elem.push_back(mem);
+    }
+    cf_ids.push_back(cf_id++);
+  }
+
+  std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+  // Nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+    list->PickMemtablesToFlush(
+        std::numeric_limits<uint64_t>::max() /* memtable_id */,
+        &flush_candidates[i]);
+    ASSERT_EQ(0, flush_candidates[i].size());
+  }
+  // Request a flush even though there is nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    list->FlushRequested();
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  autovector<MemTable*> to_delete;
+  // Add tables to the immutable memtable lists associated with column families
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      lists[i]->Add(tables[i][j], &to_delete);
+    }
+    ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+    ASSERT_TRUE(lists[i]->IsFlushPending());
+    ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+  //          +----+
+  // list[0]: |0  1|
+  // list[1]: |0  1|
+  //          |    +--+
+  // list[2]: |0|  1
+  //          +-+
+  // Pick memtables to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    flush_candidates[i].clear();
+    lists[i]->PickMemtablesToFlush(flush_memtable_ids[i],
+                                   &flush_candidates[i]);
+    ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+              static_cast<uint64_t>(flush_candidates[i].size()));
+  }
+  autovector<MemTableList*> tmp_lists;
+  autovector<uint32_t> tmp_cf_ids;
+  autovector<const MutableCFOptions*> tmp_options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  for (auto i = 0; i != num_cfs; ++i) {
+    if (!flush_candidates[i].empty()) {
+      to_flush.push_back(&flush_candidates[i]);
+      tmp_lists.push_back(lists[i]);
+      tmp_cf_ids.push_back(i);
+      tmp_options_list.push_back(mutable_cf_options_list[i]);
+    }
+  }
+  Status s = Mock_InstallMemtableAtomicFlushResults(
+      tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+  ASSERT_OK(s);
+
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+        ASSERT_LT(0, tables[i][j]->GetFileNumber());
+      }
+    }
+    ASSERT_EQ(
+        static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+        lists[i]->NumNotFlushed());
+  }
+
+  to_delete.clear();
+  for (auto list : lists) {
+    list->current()->Unref(&to_delete);
+    delete list;
+  }
+  for (auto& mutable_cf_options : mutable_cf_options_list) {
+    if (mutable_cf_options != nullptr) {
+      delete mutable_cf_options;
+      mutable_cf_options = nullptr;
+    }
+  }
+  // All memtables in the tables array must have been flushed, and are thus
+  // ready to be deleted.
+  ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this by Ref'ing and then Unref'ing.
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..8a7b07290
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial or full merge.
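+//
+// Illustrative usage sketch (the operand values below are made up):
+//
+//   MergeContext ctx;
+//   // Operands are pushed newest-to-oldest during a lookup:
+//   ctx.PushOperand(Slice("+1"), false /* operand_pinned */);
+//   ctx.PushOperand(Slice("+2"), false /* operand_pinned */);
+//   // GetOperands() returns them oldest-first, i.e. in the order they
+//   // would be passed to FullMergeV2: {"+2", "+1"}.
+//   assert(ctx.GetNumOperands() == 2);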
+class MergeContext {
+ public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list_) {
+      operand_list_->clear();
+      copied_operands_->clear();
+    }
+  }
+
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+    Initialize();
+    SetDirectionBackward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // Push back a merge operand
+  void PushOperandBack(const Slice& operand_slice,
+                       bool operand_pinned = false) {
+    Initialize();
+    SetDirectionForward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // Return the total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list_) {
+      return 0;
+    }
+    return operand_list_->size();
+  }
+
+  // Get the operand at the given index.
+  Slice GetOperand(int index) const {
+    assert(operand_list_);
+
+    SetDirectionForward();
+    return (*operand_list_)[index];
+  }
+
+  // Same as GetOperandsDirectionForward
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext. If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperands() const {
+    return GetOperandsDirectionForward();
+  }
+
+  // Return all the operands in the order in which they were merged (passed to
+  // FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext. If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionForward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionForward();
+    return *operand_list_;
+  }
+
+  // Return all the operands in the reverse of the order in which they were
+  // merged (passed to FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext. If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionBackward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionBackward();
+    return *operand_list_;
+  }
+
+ private:
+  void Initialize() {
+    if (!operand_list_) {
+      operand_list_.reset(new std::vector<Slice>());
+      copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+    }
+  }
+
+  void SetDirectionForward() const {
+    if (operands_reversed_ == true) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = false;
+    }
+  }
+
+  void SetDirectionBackward() const {
+    if (operands_reversed_ == false) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = true;
+    }
+  }
+
+  // List of operands
+  mutable std::unique_ptr<std::vector<Slice>> operand_list_;
+  // Copy of operands that are not pinned.
+  std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+  mutable bool operands_reversed_ = true;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..6df841012
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,583 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+                         const MergeOperator* user_merge_operator,
+                         const CompactionFilter* compaction_filter,
+                         Logger* logger, bool assert_valid_internal_key,
+                         SequenceNumber latest_snapshot,
+                         const SnapshotChecker* snapshot_checker, int level,
+                         Statistics* stats,
+                         const std::atomic<bool>* shutting_down)
+    : env_(env),
+      clock_(env->GetSystemClock().get()),
+      user_comparator_(user_comparator),
+      user_merge_operator_(user_merge_operator),
+      compaction_filter_(compaction_filter),
+      shutting_down_(shutting_down),
+      logger_(logger),
+      assert_valid_internal_key_(assert_valid_internal_key),
+      allow_single_operand_(false),
+      latest_snapshot_(latest_snapshot),
+      snapshot_checker_(snapshot_checker),
+      level_(level),
+      keys_(),
+      filter_timer_(clock_),
+      total_filter_time_(0U),
+      stats_(stats) {
+  assert(user_comparator_ != nullptr);
+  if (user_merge_operator_) {
+    allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+  }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+                                   const Slice& key, const Slice* value,
+                                   const std::vector<Slice>& operands,
+                                   std::string* result, Logger* logger,
+                                   Statistics* statistics, SystemClock* clock,
+                                   Slice* result_operand,
+                                   bool update_num_ops_stats) {
+  assert(merge_operator != nullptr);
+
+  if (operands.empty()) {
+    assert(value != nullptr && result != nullptr);
+    result->assign(value->data(), value->size());
+    return Status::OK();
+  }
+
+  if (update_num_ops_stats) {
+    RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+                      static_cast<uint64_t>(operands.size()));
+  }
+
+  bool success = false;
+  Slice tmp_result_operand(nullptr, 0);
+  const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+                                                    logger);
+  MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+  {
+    // Setup to time the merge
+    StopWatchNano timer(clock, statistics != nullptr);
+    PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+    // Do the merge
+    success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+    if (tmp_result_operand.data()) {
+      // FullMergeV2 result is an existing operand
+      if (result_operand != nullptr) {
+        *result_operand = tmp_result_operand;
+      } else {
+        result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+      }
+    } else if (result_operand) {
+      *result_operand = Slice(nullptr, 0);
+    }
+
+    RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+               statistics ? timer.ElapsedNanos() : 0);
+  }
+
+  if (!success) {
+    RecordTick(statistics, NUMBER_MERGE_FAILURES);
+    return Status::Corruption("Error: Could not perform merge.");
+  }
+
+  return Status::OK();
+}
+
+Status MergeHelper::TimedFullMergeWithEntity(
+    const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+    const std::vector<Slice>& operands, std::string* result, Logger* logger,
+    Statistics* statistics, SystemClock* clock, bool update_num_ops_stats) {
+  WideColumns base_columns;
+
+  {
+    const Status s =
+        WideColumnSerialization::Deserialize(base_entity, base_columns);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  const bool has_default_column =
+      !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName;
+
+  Slice value_of_default;
+  if (has_default_column) {
+    value_of_default = base_columns[0].value();
+  }
+
+  std::string merge_result;
+
+  {
+    constexpr Slice* result_operand = nullptr;
+
+    const Status s = TimedFullMerge(
+        merge_operator, key, &value_of_default, operands, &merge_result, logger,
+        statistics, clock, result_operand, update_num_ops_stats);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (has_default_column) {
+    base_columns[0].value() = merge_result;
+
+    const Status s = WideColumnSerialization::Serialize(base_columns, *result);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s =
+        WideColumnSerialization::Serialize(merge_result, base_columns, *result);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while merging.
+//       keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+                               CompactionRangeDelAggregator* range_del_agg,
+                               const SequenceNumber stop_before,
+                               const bool at_bottom,
+                               const bool allow_data_in_errors,
+                               const BlobFetcher* blob_fetcher,
+                               const std::string* const full_history_ts_low,
+                               PrefetchBufferCollection* prefetch_buffers,
+                               CompactionIterationStats* c_iter_stats) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  assert(HasOperator());
+  keys_.clear();
+  merge_context_.Clear();
+  has_compaction_filter_skip_until_ = false;
+  assert(user_merge_operator_);
+  assert(user_comparator_);
+  const size_t ts_sz = user_comparator_->timestamp_size();
+  if (full_history_ts_low) {
+    assert(ts_sz > 0);
+    assert(ts_sz == full_history_ts_low->size());
+  }
+  bool first_key = true;
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // The original_key_is_iter variable just caches the information:
+  //   original_key_is_iter == (iter->key().ToString() == original_key)
+  bool original_key_is_iter = true;
+  std::string original_key = iter->key().ToString();
+  // Important:
+  //   orig_ikey is backed by original_key if keys_.empty()
+  //   orig_ikey is backed by keys_.back() if !keys_.empty()
+  ParsedInternalKey orig_ikey;
+
+  Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors);
+  assert(s.ok());
+  if (!s.ok()) return s;
+
+  assert(kTypeMerge == orig_ikey.type);
+
+  bool hit_the_next_user_key = false;
+  int cmp_with_full_history_ts_low = 0;
+  for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+    if (IsShuttingDown()) {
+      s = Status::ShutdownInProgress();
+      return s;
+    }
+
+    ParsedInternalKey ikey;
+    assert(keys_.size() == merge_context_.GetNumOperands());
+
+    Status pik_status =
+        ParseInternalKey(iter->key(), &ikey, allow_data_in_errors);
+    Slice ts;
+    if (pik_status.ok()) {
+      ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz);
+      if (full_history_ts_low) {
+        cmp_with_full_history_ts_low =
+            user_comparator_->CompareTimestamp(ts, *full_history_ts_low);
+      }
+    }
+    if (!pik_status.ok()) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        return pik_status;
+      }
+      break;
+    } else if (first_key) {
+      // If user-defined timestamp is enabled, we expect both the user key and
+      // the timestamp to be equal, as a sanity check.
+      assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+      first_key = false;
+    } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key,
+                                                        orig_ikey.user_key) ||
+               (ts_sz > 0 &&
+                !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) &&
+                cmp_with_full_history_ts_low >= 0)) {
+      // 1) hit a different user key, or
+      // 2) user-defined timestamp is enabled, and we hit a version of the
+      //    user key NOT eligible for GC; either way, stop right here.
+      hit_the_next_user_key = true;
+      break;
+    } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+               LIKELY(snapshot_checker_ == nullptr ||
+                      snapshot_checker_->CheckInSnapshot(ikey.sequence,
+                                                         stop_before) !=
+                          SnapshotCheckerResult::kNotInSnapshot)) {
+      // hit an entry that's possibly visible to the previous snapshot; we
+      // can't touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    assert(IsValueType(ikey.type));
+    if (ikey.type != kTypeMerge) {
+      // hit a put/delete/single delete
+      //   => merge the put value or a nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+
+      // If there are no operands, just return Status::OK(). That will cause
+      // the compaction iterator to write out the key we're currently at, which
+      // is the put/delete we just encountered.
+      if (keys_.empty()) {
+        return s;
+      }
+
+      // TODO(noetzli) If the merge operator returns false, we are currently
+      // (almost) silently dropping the put/delete. That's probably not what we
+      // want. Also if we're in compaction and it's a put, it would be nice to
+      // run compaction filter on it.
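+      //
+      // Worked example (illustrative, assuming a uint64-add merge operator):
+      // if the entries for a key, from newest to oldest, are
+      //   Merge(+2), Merge(+1), Put(5)
+      // then hitting the Put here folds the whole history into a single
+      //   Put(5 + 1 + 2) = Put(8)
+      // stamped with the sequence number of the newest Merge entry.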
+      std::string merge_result;
+
+      if (range_del_agg &&
+          range_del_agg->ShouldDelete(
+              ikey, RangeDelPositioningMode::kForwardTraversal)) {
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeValue) {
+        const Slice val = iter->value();
+
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeBlobIndex) {
+        BlobIndex blob_index;
+
+        s = blob_index.DecodeFrom(iter->value());
+        if (!s.ok()) {
+          return s;
+        }
+
+        FilePrefetchBuffer* prefetch_buffer =
+            prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+                                   blob_index.file_number())
+                             : nullptr;
+
+        uint64_t bytes_read = 0;
+
+        assert(blob_fetcher);
+
+        PinnableSlice blob_value;
+        s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer,
+                                    &blob_value, &bytes_read);
+        if (!s.ok()) {
+          return s;
+        }
+
+        if (c_iter_stats) {
+          ++c_iter_stats->num_blobs_read;
+          c_iter_stats->total_blob_bytes_read += bytes_read;
+        }
+
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeWideColumnEntity) {
+        s = TimedFullMergeWithEntity(
+            user_merge_operator_, ikey.user_key, iter->value(),
+            merge_context_.GetOperands(), &merge_result, logger_, stats_,
+            clock_, /* update_num_ops_stats */ false);
+      } else {
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      }
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (s.ok()) {
+        // The original key encountered
+        original_key = std::move(keys_.back());
+        orig_ikey.type = ikey.type == kTypeWideColumnEntity
+                             ? kTypeWideColumnEntity
+                             : kTypeValue;
+        UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+        keys_.clear();
+        merge_context_.Clear();
+        keys_.emplace_front(std::move(original_key));
+        merge_context_.PushOperand(merge_result);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      return s;
+    } else {
+      // hit a merge
+      //   => if there is a compaction filter, apply it.
+      //   => check for range tombstones covering the operand
+      //   => merge the operand into the front of the operands_ list
+      //      if not filtered
+      //   => then continue because we haven't yet seen a Put/Delete.
+      //
+      // Keep queuing keys and operands until we either meet a put/delete
+      // request or later do a partial merge.
+
+      Slice value_slice = iter->value();
+      // add an operand to the list if:
+      //   1) it's included in one of the snapshots. In that case we *must*
+      //      write it out, no matter what the compaction filter says
+      //   2) it's not filtered by a compaction filter
+      CompactionFilter::Decision filter =
+          ikey.sequence <= latest_snapshot_
+              ? CompactionFilter::Decision::kKeep
+              : FilterMerge(orig_ikey.user_key, value_slice);
+      if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+          range_del_agg != nullptr &&
+          range_del_agg->ShouldDelete(
+              iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+        filter = CompactionFilter::Decision::kRemove;
+      }
+      if (filter == CompactionFilter::Decision::kKeep ||
+          filter == CompactionFilter::Decision::kChangeValue) {
+        if (original_key_is_iter) {
+          // this is just an optimization that saves us one memcpy
+          keys_.emplace_front(original_key);
+        } else {
+          keys_.emplace_front(iter->key().ToString());
+        }
+        if (keys_.size() == 1) {
+          // we need to re-anchor the orig_ikey because it was anchored by
+          // original_key before
+          pik_status =
+              ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors);
+          pik_status.PermitUncheckedError();
+          assert(pik_status.ok());
+        }
+        if (filter == CompactionFilter::Decision::kKeep) {
+          merge_context_.PushOperand(
+              value_slice, iter->IsValuePinned() /* operand_pinned */);
+        } else {
+          assert(filter == CompactionFilter::Decision::kChangeValue);
+          // The compaction filter asked us to change the operand from
+          // value_slice to compaction_filter_value_.
+          merge_context_.PushOperand(compaction_filter_value_, false);
+        }
+      } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+        // The compaction filter asked us to remove this key altogether
+        // (not just this operand), along with some keys following it.
+        keys_.clear();
+        merge_context_.Clear();
+        has_compaction_filter_skip_until_ = true;
+        return s;
+      }
+    }
+  }
+
+  if (cmp_with_full_history_ts_low >= 0) {
+    size_t num_merge_operands = merge_context_.GetNumOperands();
+    if (ts_sz && num_merge_operands > 1) {
+      // We do not merge merge operands with different timestamps if they are
+      // not eligible for GC.
+      ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge operands",
+                      static_cast<int>(ts_sz),
+                      static_cast<int>(num_merge_operands));
+      assert(false);
+    }
+  }
+
+  if (merge_context_.GetNumOperands() == 0) {
+    // we filtered out all the merge operands
+    return s;
+  }
+
+  // We are sure we have seen this key's entire history if:
+  //   at_bottom == true (this does not necessarily mean it is the bottommost
+  //   layer, but rather that we are confident the key does not appear on any
+  //   of the lower layers; at_bottom == false doesn't mean it does appear,
+  //   just that we can't be sure, see Compaction::IsBottommostLevel for
+  //   details)
+  // AND
+  //   we have either encountered another key or the end of key history on
+  //   this layer.
+  // Note that if user-defined timestamp is enabled, we need some extra caution
+  // here: if full_history_ts_low is nullptr, or it's not null but the key's
+  // timestamp is greater than or equal to full_history_ts_low, it means this
+  // key cannot be dropped. We may not have seen the beginning of the key.
+  //
+  // When these conditions are true we are able to merge all the keys
+  // using full merge.
+  //
+  // For these cases we are not sure about, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
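+  //
+  // Illustrative example: when compacting with at_bottom == true (and no
+  // user-defined timestamps), a history consisting solely of
+  //   Merge(+1), Merge(+2)
+  // can be fully merged with a nullptr base value and emitted as Put(3),
+  // assuming a uint64-add operator. With at_bottom == false the same input
+  // must remain a Merge (possibly partially combined), since an older
+  // Put/Delete could still exist on a lower level.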
+  bool surely_seen_the_beginning =
+      (hit_the_next_user_key || !iter->Valid()) && at_bottom &&
+      (ts_sz == 0 || cmp_with_full_history_ts_low < 0);
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(merge_context_.GetNumOperands() >= 1);
+    assert(merge_context_.GetNumOperands() == keys_.size());
+    std::string merge_result;
+    s = TimedFullMerge(
+        user_merge_operator_, orig_ikey.user_key, nullptr,
+        merge_context_.GetOperands(), &merge_result, logger_, stats_, clock_,
+        /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+    if (s.ok()) {
+      // The original key encountered
+      // We are certain that keys_ is not empty here (see the assertions a
+      // couple of lines above).
+      original_key = std::move(keys_.back());
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+      keys_.clear();
+      merge_context_.Clear();
+      keys_.emplace_front(std::move(original_key));
+      merge_context_.PushOperand(merge_result);
+    }
+  } else {
+    // We haven't seen the beginning of the key nor a Put/Delete.
+    // Attempt to use the user's associative merge function to
+    // merge the stacked merge operands into a single operand.
+    s = Status::MergeInProgress();
+    if (merge_context_.GetNumOperands() >= 2 ||
+        (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+      bool merge_success = false;
+      std::string merge_result;
+      {
+        StopWatchNano timer(clock_, stats_ != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        merge_success = user_merge_operator_->PartialMergeMulti(
+            orig_ikey.user_key,
+            std::deque<Slice>(merge_context_.GetOperands().begin(),
+                              merge_context_.GetOperands().end()),
+            &merge_result, logger_);
+        RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+                   stats_ ? timer.ElapsedNanosSafe() : 0);
+      }
+      if (merge_success) {
+        // Merging of operands (associative merge) was successful.
+        // Replace the operands with the merge result
+        merge_context_.Clear();
+        merge_context_.PushOperand(merge_result);
+        keys_.erase(keys_.begin(), keys_.end() - 1);
+      }
+    }
+  }
+
+  return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+    : merge_helper_(merge_helper) {
+  it_keys_ = merge_helper_->keys().rend();
+  it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+  const auto& keys = merge_helper_->keys();
+  const auto& values = merge_helper_->values();
+  assert(keys.size() == values.size());
+  it_keys_ = keys.rbegin();
+  it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+  ++it_keys_;
+  ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+                                                    const Slice& value_slice) {
+  if (compaction_filter_ == nullptr) {
+    return CompactionFilter::Decision::kKeep;
+  }
+  if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+    filter_timer_.Start();
+  }
+  compaction_filter_value_.clear();
+  compaction_filter_skip_until_.Clear();
+  auto ret = compaction_filter_->FilterV2(
+      level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+      &compaction_filter_value_, compaction_filter_skip_until_.rep());
+  if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+    if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+                                  user_key) <= 0) {
+      // Invalid skip_until returned from compaction filter.
+      // Keep the key as per FilterV2 documentation.
+      ret = CompactionFilter::Decision::kKeep;
+    } else {
+      compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+                                                       kValueTypeForSeek);
+    }
+  }
+  if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+    total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..790ec6239
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/wide_columns.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+class SystemClock;
+class BlobFetcher;
+class PrefetchBufferCollection;
+struct CompactionIterationStats;
+
+class MergeHelper {
+ public:
+  MergeHelper(Env* env, const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator,
+              const CompactionFilter* compaction_filter, Logger* logger,
+              bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+              const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+              Statistics* stats = nullptr,
+              const std::atomic<bool>* shutting_down = nullptr);
+
+  // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+  // Result of merge will be written to result if status returned is OK.
+  // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if the merge is on behalf of a user
+  // read, so that the number of merge operands read is recorded in statistics.
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - Corruption: Merge operator reported unsuccessful merge.
+  static Status TimedFullMerge(const MergeOperator* merge_operator,
+                               const Slice& key, const Slice* value,
+                               const std::vector<Slice>& operands,
+                               std::string* result, Logger* logger,
+                               Statistics* statistics, SystemClock* clock,
+                               Slice* result_operand,
+                               bool update_num_ops_stats);
+
+  static Status TimedFullMergeWithEntity(
+      const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+      const std::vector<Slice>& operands, std::string* result, Logger* logger,
+      Statistics* statistics, SystemClock* clock, bool update_num_ops_stats);
+
+  // During compaction, merge entries until we hit
+  //   - a corrupted key
+  //   - a Put/Delete,
+  //   - a different user key,
+  //   - a specific sequence number (snapshot boundary),
+  //   - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // range_del_agg: (IN) filters merge operands covered by range tombstones.
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //                   0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+  //                 we could reach the start of the history of this user key.
+  // allow_data_in_errors: (IN) if true, data details will be displayed in
+  //                            error/log messages.
+  // blob_fetcher: (IN) blob fetcher object for the compaction's input version.
+  // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers
+  //                            used for compaction readahead.
+  // c_iter_stats: (OUT) compaction iteration statistics.
+  //
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+  //   of the key's history. Output consists of merge operands only.
+  // - Corruption: Merge operator reported unsuccessful merge, or a corrupted
+  //   key was encountered unexpectedly (applies only when compiling with
+  //   asserts removed).
+  // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+  //
+  // REQUIRED: The first key in the input is not corrupted.
+  Status MergeUntil(InternalIterator* iter,
+                    CompactionRangeDelAggregator* range_del_agg,
+                    const SequenceNumber stop_before, const bool at_bottom,
+                    const bool allow_data_in_errors,
+                    const BlobFetcher* blob_fetcher,
+                    const std::string* const full_history_ts_low,
+                    PrefetchBufferCollection* prefetch_buffers,
+                    CompactionIterationStats* c_iter_stats);
+
+  // Filters a merge operand using the compaction filter specified
+  // in the constructor. Returns the decision that the filter made.
+  // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+  // optional outputs of the compaction filter.
+  // user_key includes the timestamp if user-defined timestamp is enabled.
+  CompactionFilter::Decision FilterMerge(const Slice& user_key,
+                                         const Slice& value_slice);
+
+  // Query the merge result.
+  // These are valid until the next MergeUntil call.
+  // If the merging was successful:
+  //   - keys() contains a single element with the latest sequence number of
+  //     the merges. The type will be Put or Merge. See IMPORTANT 1 note,
+  //     below.
+  //   - values() contains a single element with the result of merging all the
+  //     operands together
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //     values() is parallel to keys() so that the first entry in
+  //     keys() is the key associated with the first entry in values()
+  //     and so on. These lists will be the same length.
+  //     All of these pairs will be merges over the same user key.
+  //     See IMPORTANT 2 note below.
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //        So keys().back() was the first key seen by the iterator.
+  // TODO: Re-style this comment to be like the first one
+  const std::deque<std::string>& keys() const { return keys_; }
+  const std::vector<Slice>& values() const {
+    return merge_context_.GetOperands();
+  }
+  uint64_t TotalFilterTime() const { return total_filter_time_; }
+  bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+  // If the compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+  // return true and fill *skip_until with the key to which we should skip.
+  // If true, keys() and values() are empty.
+  bool FilteredUntil(Slice* skip_until) const {
+    if (!has_compaction_filter_skip_until_) {
+      return false;
+    }
+    assert(compaction_filter_ != nullptr);
+    assert(skip_until != nullptr);
+    assert(compaction_filter_skip_until_.Valid());
+    *skip_until = compaction_filter_skip_until_.Encode();
+    return true;
+  }
+
+ private:
+  Env* env_;
+  SystemClock* clock_;
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  Logger* logger_;
+  bool assert_valid_internal_key_;  // enforce no internal key corruption?
+  bool allow_single_operand_;
+  SequenceNumber latest_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  int level_;
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+
+  // Keeps track of the sequence of keys seen
+  std::deque<std::string> keys_;
+  // Parallel with keys_; stores the operands
+  mutable MergeContext merge_context_;
+
+  StopWatchNano filter_timer_;
+  uint64_t total_filter_time_;
+  Statistics* stats_;
+
+  bool has_compaction_filter_skip_until_ = false;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+  // The MergeOutputIterator is bound to a MergeHelper instance.
+  explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+  // Seeks to the first record in the output.
+  void SeekToFirst();
+  // Advances to the next record in the output.
+  void Next();
+
+  Slice key() { return Slice(*it_keys_); }
+  Slice value() { return Slice(*it_values_); }
+  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+  const MergeHelper* merge_helper_;
+  std::deque<std::string>::const_reverse_iterator it_keys_;
+  std::vector<Slice>::const_reverse_iterator it_values_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..05408d5b9
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
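+//
+// Reader's sketch (illustrative): these tests drive MergeHelper::MergeUntil()
+// directly over a VectorIterator of hand-built internal keys, e.g.:
+//
+//   AddKeyVal("a", 20 /* seq */, kTypeMerge, test::EncodeInt(1U));
+//   Status s = Run(0 /* stop_before */, true /* at_bottom */);
+//   // s.ok()                 => operands folded into a single Put
+//   // s.IsMergeInProgress()  => only merge operands were seen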
+
+#include "db/merge_helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+  MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); }
+
+  ~MergeHelperTest() override = default;
+
+  Status Run(SequenceNumber stop_before, bool at_bottom,
+             SequenceNumber latest_snapshot = 0) {
+    iter_.reset(new VectorIterator(ks_, vs_, &icmp_));
+    iter_->SeekToFirst();
+    merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(),
+                                        merge_op_.get(), filter_.get(), nullptr,
+                                        false, latest_snapshot));
+    return merge_helper_->MergeUntil(
+        iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom,
+        false /* allow_data_in_errors */, nullptr /* blob_fetcher */,
+        nullptr /* full_history_ts_low */, nullptr /* prefetch_buffers */,
+        nullptr /* c_iter_stats */);
+  }
+
+  void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+                 const ValueType& t, const std::string& val,
+                 bool corrupt = false) {
+    InternalKey ikey(user_key, seq, t);
+    if (corrupt) {
+      test::CorruptKeyType(&ikey);
+    }
+    ks_.push_back(ikey.Encode().ToString());
+    vs_.push_back(val);
+  }
+
+  Env* env_;
+  InternalKeyComparator icmp_;
+  std::unique_ptr<VectorIterator> iter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::vector<std::string> ks_;
+  std::vector<std::string> vs_;
+  std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and it can merge keys.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U));  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(0, true).ok());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));  // <- iter_ after merge
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(0, false).ok());
+  ASSERT_EQ(ks_[3], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U));  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, "v2");
+  AddKeyVal("a", 40, kTypeMerge, "v");  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, "v");
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ("v", merge_helper_->values()[0]);
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+  ASSERT_EQ("v2", merge_helper_->values()[1]);
+  ASSERT_EQ(2U, merge_helper_->keys().size());
+  ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand cannot be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeDeletion, "");
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+  // Corrupt key
+  AddKeyVal("a", 20, kTypeDeletion, "", true);  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+  // filtered out all
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // we have one operand that will survive because it's a delete
+  AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+  AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+  ASSERT_TRUE(Run(15, true).ok());
+  merge_output_iter = MergeOutputIterator(merge_helper_.get());
+  ASSERT_TRUE(iter_->Valid());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // when all merge operands are filtered out, we leave the iterator pointing
+  // to the Put/Delete that survived
+  ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+  ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));  // next user key
+
+  ASSERT_OK(Run(15, true));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  // the sequence number is 29 here, because the first merge operand got
+  // filtered out
+  ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // make sure that we're passing user keys into the filter
+  ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+  ASSERT_OK(Run(15, true, 32));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..d32585640
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                MergeOperationOutput* merge_out) const {
+  // If FullMergeV2 is not implemented, we convert the operand_list to
+  // std::deque<std::string> and pass it to FullMerge
+  std::deque<std::string> operand_list_str;
+  for (auto& op : merge_in.operand_list) {
+    operand_list_str.emplace_back(op.data(), op.size());
+  }
+  return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+                   &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    std::string temp_value;
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Simply loop through the operands
+  Slice temp_existing;
+  const Slice* existing_value = merge_in.existing_value;
+  for (const auto& operand : merge_in.operand_list) {
+    std::string temp_value;
+    if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+               merge_in.logger)) {
+      return false;
+    }
+    swap(temp_value, merge_out->new_value);
+    temp_existing = Slice(merge_out->new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user-defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(const Slice& key,
+                                            const Slice& left_operand,
+                                            const Slice& right_operand,
+                                            std::string* new_value,
+                                            Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..0d373d41e
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,629 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+             std::string* new_value, Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_merge_operator_calls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
+                                        logger);
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+class EnvMergeTest : public EnvWrapper {
+ public:
+  EnvMergeTest() : EnvWrapper(Env::Default()) {}
+  static const char* kClassName() { return "MergeEnv"; }
+  const char* Name() const override { return kClassName(); }
+  // ~EnvMergeTest() override {}
+
+  uint64_t NowNanos() override {
+    ++now_nanos_count_;
+    return target()->NowNanos();
+  }
+
+  static uint64_t now_nanos_count_;
+
+  static std::unique_ptr<EnvMergeTest> singleton_;
+
+  static EnvMergeTest* GetInstance() {
+    if (nullptr == singleton_) singleton_.reset(new EnvMergeTest);
+    return singleton_.get();
+  }
+};
+
+uint64_t EnvMergeTest::now_nanos_count_{0};
+std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  options.env = EnvMergeTest::GetInstance();
+  EXPECT_OK(DestroyDB(dbname, Options()));
+  Status s;
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  if (ttl) {
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+#else
+  assert(!ttl);
+  s = DB::Open(options, dbname, &db);
+#endif  // !ROCKSDB_LITE
+  EXPECT_OK(s);
+  assert(s.ok());
+  // Calling NowNanos is allowed during DB creation (in GenerateRawUniqueId()
+  // for the session ID), so reset the counter afterwards.
+  EnvMergeTest::now_nanos_count_ = 0;
+  return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high-level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying DB operation failed.
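+  //
+  // Usage sketch (illustrative; `dbname` stands for any scratch path):
+  //
+  //   Counters counters(OpenDb(dbname));
+  //   counters.assert_set("a", 1);   // Put
+  //   counters.assert_add("a", 2);   // get -> modify -> set
+  //   assert(counters.assert_get("a") == 3);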
+
+  // mapped to a rocksdb Put
+  bool set(const std::string& key, uint64_t value) {
+    // just treat the internal rep of uint64 as the string
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    Slice slice(buf, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const std::string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const std::string& key, uint64_t* value) {
+    std::string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        std::cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const std::string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+  // convenience functions for testing
+  void assert_set(const std::string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const std::string& key) { assert(remove(key)); }
+
+  uint64_t assert_get(const std::string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const std::string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_;  // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount), merge_option_() {}
+
+  // mapped to a rocksdb Merge operation
+  bool add(const std::string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+};
+
+void dumpDb(DB* db) {
+  auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    // uint64_t value = DecodeFixed64(it->value().data());
+    // std::cout << it->key().ToString() << ": " << value << std::endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+  }
+
+  ASSERT_EQ(counters.assert_get("a"), 1);
+
+  counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+  ASSERT_EQ(counters.assert_get("b"), 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+  }
+
+  // 1+2 = 3
+  ASSERT_EQ(counters.assert_get("a"), 3);
+
+  dumpDb(db);
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  ASSERT_EQ(counters.assert_get("b"), sum);
+
+  dumpDb(db);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    dumpDb(db);
+
+    ASSERT_EQ(counters.assert_get("a"), 3);
+    ASSERT_EQ(counters.assert_get("b"), sum);
+  }
+}
+
+void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+  ASSERT_OK(db->Put({}, "1", "1"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  std::atomic<int> cnt{0};
+  const auto get_thread_id = [&cnt]() {
+    thread_local int thread_id{cnt++};
+    return thread_id;
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) {
+        int thread_id = get_thread_id();
+        if (1 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::bg_compact_thread:0");
+        } else if (2 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::bg_flush_thread:0");
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) {
+        int thread_id = get_thread_id();
+        if (0 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::set_options_thread:0");
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::set_options_thread:1");
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) {
+        auto* mutex = reinterpret_cast<InstrumentedMutex*>(arg);
+        mutex->AssertHeld();
+        int thread_id = get_thread_id();
+        ASSERT_EQ(2, thread_id);
+        mutex->Unlock();
+        TEST_SYNC_POINT(
+            "testCountersWithFlushAndCompaction::bg_flush_thread:1");
+        TEST_SYNC_POINT(
+            "testCountersWithFlushAndCompaction::bg_flush_thread:2");
+        mutex->Lock();
+      });
+  SyncPoint::GetInstance()->LoadDependency({
{"testCountersWithFlushAndCompaction::set_options_thread:0", + "testCountersWithCompactionAndFlush:BeforeCompact"}, + {"testCountersWithFlushAndCompaction::bg_compact_thread:0", + "testCountersWithFlushAndCompaction:BeforeIncCounters"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:0", + "testCountersWithFlushAndCompaction::set_options_thread:1"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:1", + "testCountersWithFlushAndCompaction:BeforeVerification"}, + {"testCountersWithFlushAndCompaction:AfterGet", + "testCountersWithFlushAndCompaction::bg_flush_thread:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread set_options_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->SetOptions( + {{"disable_auto_compactions", "false"}})); + }); + TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); + port::Thread compact_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->CompactRange( + CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); + }); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters"); + counters.add("test-key", 1); + + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db->Flush(flush_opts)); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification"); + std::string expected; + PutFixed64(&expected, 1); + std::string actual; + Status s = db->Get(ReadOptions(), "test-key", &actual); + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet"); + set_options_thread.join(); + compact_thread.join(); + ASSERT_OK(s); + ASSERT_EQ(expected, actual); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +void testSuccessiveMerge(Counters& counters, size_t max_num_merges, + size_t num_merges) { + counters.assert_remove("z"); + uint64_t sum = 0; + + for (size_t i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1); + } else { + ASSERT_EQ(num_merge_operator_calls, 0); + } + + resetNumMergeOperatorCalls(); + ASSERT_EQ(counters.assert_get("z"), sum); + ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1)); + } +} + +void testPartialMerge(Counters* counters, DB* db, size_t max_merge, + size_t min_merge, size_t count) { + FlushOptions o; + o.wait = true; + + // Test case 1: partial merge should be called when the number of merge + // operands exceeds the threshold. + uint64_t tmp_sum = 0; + resetNumPartialMergeCalls(); + for (size_t i = 1; i <= count; i++) { + counters->assert_add("b", i); + tmp_sum += i; + } + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(tmp_sum, counters->assert_get("b")); + if (count > max_merge) { + // in this case, FullMerge should be called instead. + ASSERT_EQ(num_partial_merge_calls, 0U); + } else { + // if count >= min_merge, then partial merge should be called once. + ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1)); + } + + // Test case 2: partial merge should not be called when a put is found. 
+  resetNumPartialMergeCalls();
+  tmp_sum = 0;
+  ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"));
+  for (size_t i = 1; i <= count; i++) {
+    counters->assert_add("c", i);
+    tmp_sum += i;
+  }
+  ASSERT_OK(db->Flush(o));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+  ASSERT_EQ(num_partial_merge_calls, 0U);
+  // NowNanos was previously called in MergeHelper::FilterMerge(), which
+  // harmed performance.
+  ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+                                    size_t num_merges) {
+  ASSERT_GT(num_merges, max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  char buf[sizeof(merge_value)];
+  EncodeFixed64(buf, merge_value);
+  Slice merge_value_slice(buf, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (size_t i = 0; i < num_merges; ++i) {
+    ASSERT_OK(batch.Merge(key, merge_value_slice));
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  ASSERT_OK(db->Write(WriteOptions(), &batch));
+  ASSERT_EQ(
+      num_merge_operator_calls,
+      static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  std::string get_value_str;
+  ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str));
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(num_merge_operator_calls,
+            static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+  {
+    auto db = OpenDb(dbname, use_ttl);
+
+    {
+      Counters counters(db, 0);
+      testCounters(counters, db.get(), true);
+    }
+
+    {
+      MergeBasedCounters counters(db, 0);
+      testCounters(counters, db.get(), use_compression);
+    }
+  }
+
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  {
+    size_t max_merge = 5;
+    auto db = OpenDb(dbname, use_ttl, max_merge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), use_compression);
+    testSuccessiveMerge(counters, max_merge, max_merge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    ASSERT_OK(db->Close());
+    ASSERT_OK(DestroyDB(dbname, Options()));
+  }
+
+  {
+    size_t max_merge = 100;
+    // Min merge is hard-coded to 2.
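+    // (Two operands are the smallest chain a partial merge can combine;
+    // with a single operand there is nothing to fold together, which is why
+    // the loop below probes counts of min_merge - 1, min_merge, and
+    // min_merge + 1.)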
+ uint32_t min_merge = 2; + for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) { + auto db = OpenDb(dbname, use_ttl, max_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, count); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); + } + { + auto db = OpenDb(dbname, use_ttl, max_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, + min_merge * 10); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); + } + } + + { + { + auto db = OpenDb(dbname); + MergeBasedCounters counters(db, 0); + counters.add("test-key", 1); + counters.add("test-key", 1); + counters.add("test-key", 1); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + DB* reopen_db; + ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); + std::string value; + ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); + delete reopen_db; + ASSERT_OK(DestroyDB(dbname, Options())); + } + + /* Temporary remove this test + { + std::cout << "Test merge-operator not set after reopen (recovery case)\n"; + { + auto db = OpenDb(dbname); + MergeBasedCounters counters(db, 0); + counters.add("test-key", 1); + counters.add("test-key", 1); + counters.add("test-key", 1); + } + + DB* reopen_db; + ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument()); + } + */ +} + +TEST_F(MergeTest, MergeDbTest) { + runTest(test::PerThreadDBPath("merge_testdb")); +} + +#ifndef ROCKSDB_LITE +TEST_F(MergeTest, MergeDbTtlTest) { + runTest(test::PerThreadDBPath("merge_testdbttl"), + true); // Run test on TTL database +} + +TEST_F(MergeTest, MergeWithCompactionAndFlush) { + const std::string dbname = + test::PerThreadDBPath("merge_with_compaction_and_flush"); + { + auto db = OpenDb(dbname); + { + MergeBasedCounters counters(db, 0); + testCountersWithFlushAndCompaction(counters, db.get()); + } + } + ASSERT_OK(DestroyDB(dbname, Options())); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::use_compression = false; + if (argc > 1) { + ROCKSDB_NAMESPACE::use_compression = true; + } + + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc new file mode 100644 index 000000000..8e9f28f65 --- /dev/null +++ b/src/rocksdb/db/obsolete_files_test.cc @@ -0,0 +1,328 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
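+//
+// Covers detection and purging of obsolete files (SST, WAL, OPTIONS and blob
+// files), including a race between FindObsoleteFiles()/PurgeObsoleteFiles()
+// and concurrent background work.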
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+  ObsoleteFilesTest()
+      : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true),
+        wal_dir_(dbname_ + "/wal_files") {}
+
+  void AddKeys(int numkeys, int startkey) {
+    WriteOptions options;
+    options.sync = false;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  void createLevel0Files(int numFiles, int numKeysPerFile) {
+    int startKey = 0;
+    for (int i = 0; i < numFiles; i++) {
+      AddKeys(numKeysPerFile, startKey);
+      startKey += numKeysPerFile;
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+      ASSERT_OK(
+          dbfull()->TEST_WaitForCompact());  // wait for background flush
+                                             // (flush is also a kind of
+                                             // compaction).
+    }
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+    int log_cnt = 0;
+    int sst_cnt = 0;
+    int manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kWalFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+  void ReopenDB() {
+    Options options = CurrentOptions();
+    // Trigger compaction when the number of level 0 files reaches 2.
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger = 2;
+    options.disable_auto_compactions = false;
+    options.delete_obsolete_files_period_micros = 0;  // always do full purge
+    options.enable_thread_tracking = true;
+    options.write_buffer_size = 1024 * 1024 * 1000;
+    options.target_file_size_base = 1024 * 1024 * 1000;
+    options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options.WAL_ttl_seconds = 300;     // Used to test log files
+    options.WAL_size_limit_MB = 1024;  // Used to test log files
+    options.wal_dir = wal_dir_;
+
+    // Note: the following prevents an otherwise harmless data race between the
+    // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the
+    // periodic stat dumping thread.
+    options.stats_dump_period_sec = 0;
+
+    Destroy(options);
+    Reopen(options);
+  }
+
+  const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+  ReopenDB();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+      {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+  });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+        Status* p_status = reinterpret_cast<Status*>(arg);
+        ASSERT_OK(*p_status);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+        std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+            reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+        ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  port::Thread user_thread([this]() {
+    JobContext jobCxt(0);
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+                                false /* no_full_scan=false */);
+    dbfull()->TEST_UnlockMutex();
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+    dbfull()->PurgeObsoleteFiles(jobCxt);
+    jobCxt.Clean();
+  });
+
+  user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+  ReopenDB();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  ASSERT_OK(dbfull()->DisableFileDeletions());
+  for (int i = 0; i != 4; ++i) {
+    if (i % 2) {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "false"}}));
+    } else {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "true"}}));
+    }
+  }
+  ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+
+  Close();
+
+  std::vector<std::string> files;
+  int opts_file_count = 0;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t file_num;
+    Slice dummy_info_log_name_prefix;
+    FileType type;
+    WalFileType log_type;
+    if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+                      &log_type) &&
+        type == kOptionsFile) {
+      opts_file_count++;
+    }
+  }
+  ASSERT_EQ(2, opts_file_count);
+}
+
+TEST_F(ObsoleteFilesTest, BlobFiles) {
+  ReopenDB();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  assert(versions->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  const ImmutableCFOptions* const ioptions = cfd->ioptions();
+  assert(ioptions);
+  assert(!ioptions->cf_paths.empty());
+
+  const std::string& path = ioptions->cf_paths.front().path;
+
+  // Add an obsolete blob file.
+  constexpr uint64_t first_blob_file_number = 234;
+  versions->AddObsoleteBlobFile(first_blob_file_number, path);
+
+  // Add a live blob file.
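+  // (SharedBlobFileMetaData carries the immutable, file-level facts such as
+  // blob count, total bytes, and checksum; the per-version BlobFileMetaData
+  // created from it below adds the linked SSTs and garbage statistics.)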
+ Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + constexpr uint64_t second_blob_file_number = 456; + constexpr uint64_t second_total_blob_count = 100; + constexpr uint64_t second_total_blob_bytes = 2000000; + constexpr char second_checksum_method[] = "CRC32B"; + constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a"; + + auto shared_meta = SharedBlobFileMetaData::Create( + second_blob_file_number, second_total_blob_count, second_total_blob_bytes, + second_checksum_method, second_checksum_value); + + constexpr uint64_t second_garbage_blob_count = 0; + constexpr uint64_t second_garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + second_garbage_blob_count, second_garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + + // Check for obsolete files and make sure the first blob file is picked up + // and grabbed for purge. The second blob file should be on the live list. + constexpr int job_id = 0; + JobContext job_context{job_id}; + + dbfull()->TEST_LockMutex(); + constexpr bool force_full_scan = false; + dbfull()->FindObsoleteFiles(&job_context, force_full_scan); + dbfull()->TEST_UnlockMutex(); + + ASSERT_TRUE(job_context.HaveSomethingToDelete()); + ASSERT_EQ(job_context.blob_delete_files.size(), 1); + ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(), + first_blob_file_number); + + const auto& files_grabbed_for_purge = + dbfull()->TEST_GetFilesGrabbedForPurge(); + ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number), + files_grabbed_for_purge.end()); + + ASSERT_EQ(job_context.blob_live.size(), 1); + ASSERT_EQ(job_context.blob_live[0], second_blob_file_number); + + // Hack the job context a bit by adding a few files to the full scan + // list and adjusting the pending file number. We add the two files + // above as well as two additional ones, where one is old + // and should be cleaned up, and the other is still pending. + constexpr uint64_t old_blob_file_number = 123; + constexpr uint64_t pending_blob_file_number = 567; + + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(old_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(first_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(second_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(pending_blob_file_number), path); + + job_context.min_pending_output = pending_blob_file_number; + + // Purge obsolete files and make sure we purge the old file and the first file + // (and keep the second file and the pending file). 
+  std::vector<std::string> deleted_files;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) {
+        const std::string* file = static_cast<const std::string*>(arg);
+        assert(file);
+
+        constexpr char blob_extension[] = ".blob";
+
+        if (file->find(blob_extension) != std::string::npos) {
+          deleted_files.emplace_back(*file);
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->PurgeObsoleteFiles(job_context);
+  job_context.Clean();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number),
+            files_grabbed_for_purge.end());
+
+  std::sort(deleted_files.begin(), deleted_files.end());
+  const std::vector<std::string> expected_deleted_files{
+      BlobFileName(path, old_blob_file_number),
+      BlobFileName(path, first_blob_file_number)};
+
+  ASSERT_EQ(deleted_files, expected_deleted_files);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..eb02e6ca4
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+  OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+  std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+                        std::unordered_set<std::string>* filename_history,
+                        int* options_files_count) {
+  std::vector<std::string> filenames;
+  EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+  uint64_t number;
+  FileType type;
+  *options_files_count = 0;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      filename_history->insert(filename);
+      (*options_files_count)++;
+    }
+  }
+}
+
+// Verify whether the current Options Files are the latest ones.
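+// Options files encode a monotonically increasing file number in their name
+// (something like OPTIONS-000042; the exact formatting is an implementation
+// detail), so the string-wise ASSERT_GT below is enough to check that every
+// surviving file sorts after any file that was purged.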
+void VerifyOptionsFileName(
+    DB* db, const std::unordered_set<std::string>& past_filenames) {
+  std::vector<std::string> filenames;
+  std::unordered_set<std::string> current_filenames;
+  EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+  uint64_t number;
+  FileType type;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      current_filenames.insert(filename);
+    }
+  }
+  for (auto past_filename : past_filenames) {
+    if (current_filenames.find(past_filename) != current_filenames.end()) {
+      continue;
+    }
+    for (auto filename : current_filenames) {
+      ASSERT_GT(filename, past_filename);
+    }
+  }
+}
+}  // anonymous namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+  const int kReopenCount = 20;
+  Options opt;
+  opt.create_if_missing = true;
+  ASSERT_OK(DestroyDB(dbname_, opt));
+  std::unordered_set<std::string> filename_history;
+  DB* db;
+  for (int i = 0; i < kReopenCount; ++i) {
+    ASSERT_OK(DB::Open(opt, dbname_, &db));
+    int num_options_files = 0;
+    UpdateOptionsFiles(db, &filename_history, &num_options_files);
+    ASSERT_GT(num_options_files, 0);
+    ASSERT_LE(num_options_files, 2);
+    // Make sure we always keep the latest option files.
+    VerifyOptionsFileName(db, filename_history);
+    delete db;
+  }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+  const uint64_t kOptionsFileNum = 12345;
+  uint64_t number;
+  FileType type;
+
+  auto options_file_name = OptionsFileName("", kOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+  ASSERT_EQ(type, kOptionsFile);
+  ASSERT_EQ(number, kOptionsFileNum);
+
+  const uint64_t kTempOptionsFileNum = 54352;
+  auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+  ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+            std::string::npos);
+  ASSERT_EQ(type, kTempFile);
+  ASSERT_EQ(number, kTempOptionsFileNum);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  printf("Skipped as Options file is not supported in RocksDBLite.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/output_validator.cc b/src/rocksdb/db/output_validator.cc
new file mode 100644
index 000000000..e93e2d68c
--- /dev/null
+++ b/src/rocksdb/db/output_validator.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// +#include "db/output_validator.h" + +#include "test_util/sync_point.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { +Status OutputValidator::Add(const Slice& key, const Slice& value) { + if (enable_hash_) { + // Generate a rolling 64-bit hash of the key and values + paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); + } + if (enable_order_check_) { + TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", + /*arg=*/nullptr); + if (key.size() < kNumInternalBytes) { + return Status::Corruption( + "Compaction tries to write a key without internal bytes."); + } + // prev_key_ starts with empty. + if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) { + return Status::Corruption("Compaction sees out-of-order keys."); + } + prev_key_.assign(key.data(), key.size()); + } + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/output_validator.h b/src/rocksdb/db/output_validator.h new file mode 100644 index 000000000..40635f9c4 --- /dev/null +++ b/src/rocksdb/db/output_validator.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// A class that validates key/value that is inserted to an SST file. +// Pass every key/value of the file using OutputValidator::Add() +// and the class validates key order and optionally calculate a hash +// of all the key and value. +class OutputValidator { + public: + explicit OutputValidator(const InternalKeyComparator& icmp, + bool enable_order_check, bool enable_hash, + uint64_t precalculated_hash = 0) + : icmp_(icmp), + paranoid_hash_(precalculated_hash), + enable_order_check_(enable_order_check), + enable_hash_(enable_hash) {} + + // Add a key to the KV sequence, and return whether the key follows + // criteria, e.g. key is ordered. + Status Add(const Slice& key, const Slice& value); + + // Compare result of two key orders are the same. It can be used + // to compare the keys inserted into a file, and what is read back. + // Return true if the validation passes. + bool CompareValidator(const OutputValidator& other_validator) { + return GetHash() == other_validator.GetHash(); + } + + // Not (yet) intended to be persisted, so subject to change + // without notice between releases. + uint64_t GetHash() const { return paranoid_hash_; } + + private: + const InternalKeyComparator& icmp_; + std::string prev_key_; + uint64_t paranoid_hash_ = 0; + bool enable_order_check_; + bool enable_hash_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc new file mode 100644 index 000000000..454d12dc5 --- /dev/null +++ b/src/rocksdb/db/perf_context_test.cc @@ -0,0 +1,1010 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#include "rocksdb/perf_context.h" + +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "monitoring/instrumented_mutex.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" +#include "test_util/testharness.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +bool FLAGS_random_key = false; +bool FLAGS_use_set_based_memetable = false; +int FLAGS_total_keys = 100; +int FLAGS_write_buffer_size = 1000000000; +int FLAGS_max_write_buffer_number = 8; +int FLAGS_min_write_buffer_number_to_merge = 7; +bool FLAGS_verbose = false; + +// Path to the database on file system +const std::string kDbName = + ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test"); + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr OpenDb(bool read_only = false) { + DB* db; + Options options; + options.create_if_missing = true; + options.max_open_files = -1; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + if (FLAGS_use_set_based_memetable) { +#ifndef ROCKSDB_LITE + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); +#endif // ROCKSDB_LITE + } + + Status s; + if (!read_only) { + s = DB::Open(options, kDbName, &db); + } else { + s = DB::OpenForReadOnly(options, kDbName, &db); + } + EXPECT_OK(s); + return std::shared_ptr(db); +} + +class PerfContextTest : public testing::Test {}; + +TEST_F(PerfContextTest, SeekIntoDeletion) { + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + ASSERT_OK(db->Put(write_options, key, value)); + } + + for (int i = 0; i < FLAGS_total_keys - 1; ++i) { + std::string key = "k" + std::to_string(i); + ASSERT_OK(db->Delete(write_options, key)); + } + + HistogramImpl hist_get; + HistogramImpl hist_get_time; + for (int i = 0; i < FLAGS_total_keys - 1; ++i) { + std::string key = "k" + std::to_string(i); + std::string value; + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get()); + timer.Start(); + auto status = db->Get(read_options, key, &value); + auto elapsed_nanos = timer.ElapsedNanos(); + ASSERT_TRUE(status.IsNotFound()); + hist_get.Add(get_perf_context()->user_key_comparison_count); + hist_get_time.Add(elapsed_nanos); + } + + if (FLAGS_verbose) { + std::cout << "Get user key comparison: \n" + << hist_get.ToString() << "Get time: \n" + << hist_get_time.ToString(); + } + + { + HistogramImpl hist_seek_to_first; + std::unique_ptr iter(db->NewIterator(read_options)); + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get(), true); + iter->SeekToFirst(); + hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count); + auto elapsed_nanos = timer.ElapsedNanos(); + + if (FLAGS_verbose) { + std::cout << "SeekToFirst user key comparison: \n" + << hist_seek_to_first.ToString() << "ikey skipped: " + << get_perf_context()->internal_key_skipped_count << "\n" + << "idelete skipped: " + << 
get_perf_context()->internal_delete_skipped_count << "\n" + << "elapsed: " << elapsed_nanos << "\n"; + } + } + + HistogramImpl hist_seek; + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::unique_ptr iter(db->NewIterator(read_options)); + std::string key = "k" + std::to_string(i); + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get(), true); + iter->Seek(key); + auto elapsed_nanos = timer.ElapsedNanos(); + hist_seek.Add(get_perf_context()->user_key_comparison_count); + if (FLAGS_verbose) { + std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count + << " ikey skipped " + << get_perf_context()->internal_key_skipped_count + << " idelete skipped " + << get_perf_context()->internal_delete_skipped_count + << " elapsed: " << elapsed_nanos << "ns\n"; + } + + get_perf_context()->Reset(); + ASSERT_TRUE(iter->Valid()); + StopWatchNano timer2(SystemClock::Default().get(), true); + iter->Next(); + auto elapsed_nanos2 = timer2.ElapsedNanos(); + if (FLAGS_verbose) { + std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count + << "elapsed: " << elapsed_nanos2 << "ns\n"; + } + } + + if (FLAGS_verbose) { + std::cout << "Seek user key comparison: \n" << hist_seek.ToString(); + } +} + +TEST_F(PerfContextTest, StopWatchNanoOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatchNano timer(SystemClock::Default().get(), true); + for (auto& timing : timings) { + timing = timer.ElapsedNanos(true /* reset */); + } + + HistogramImpl histogram; + for (const auto timing : timings) { + histogram.Add(timing); + } + + if (FLAGS_verbose) { + std::cout << histogram.ToString(); + } +} + +TEST_F(PerfContextTest, StopWatchOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + uint64_t elapsed = 0; + std::vector timings(kTotalIterations); + + StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); + for (auto& timing : timings) { + timing = elapsed; + } + + HistogramImpl histogram; + uint64_t prev_timing = 0; + for (const auto timing : timings) { + histogram.Add(timing - prev_timing); + prev_timing = timing; + } + + if (FLAGS_verbose) { + std::cout << histogram.ToString(); + } +} + +void ProfileQueries(bool enabled_time = false) { + ASSERT_OK(DestroyDB(kDbName, Options())); // Start this test with a fresh DB + + auto db = OpenDb(); + + WriteOptions write_options; + ReadOptions read_options; + + HistogramImpl hist_put; + + HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_files; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + + HistogramImpl hist_mget; + HistogramImpl hist_mget_snapshot; + HistogramImpl hist_mget_memtable; + HistogramImpl hist_mget_files; + HistogramImpl hist_mget_post_process; + HistogramImpl hist_mget_num_memtable_checked; + + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; + HistogramImpl hist_write_delay_time; + HistogramImpl hist_write_thread_wait_nanos; + HistogramImpl hist_write_scheduling_time; + + uint64_t total_db_mutex_nanos = 0; + + if (FLAGS_verbose) { + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + } + + std::vector keys; + const int kFlushFlag = -1; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + if (i == FLAGS_total_keys / 2) { + // Issuing a flush in the middle. 
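+      // (so the reads below exercise both the memtable and the SST file
+      // paths, i.e. get_from_memtable_time as well as
+      // get_from_output_files_time.)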
+      keys.push_back(kFlushFlag);
+    }
+  }
+
+  if (FLAGS_random_key) {
+    RandomShuffle(std::begin(keys), std::end(keys));
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+  int num_mutex_waited = 0;
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      FlushOptions fo;
+      db->Flush(fo);
+      continue;
+    }
+
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Put(write_options, key, value));
+    if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+      ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+    }
+    hist_write_pre_post.Add(
+        get_perf_context()->write_pre_and_post_process_time);
+    hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+    hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+    hist_write_thread_wait_nanos.Add(
+        get_perf_context()->write_thread_wait_nanos);
+    hist_write_scheduling_time.Add(
+        get_perf_context()->write_scheduling_flushes_compactions_time);
+    hist_put.Add(get_perf_context()->user_key_comparison_count);
+    total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + std::to_string(i);
+    std::string expected_value = "v" + std::to_string(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
+    hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_mget.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put user key comparison: \n"
+              << hist_put.ToString() << "Get user key comparison: \n"
+              << hist_get.ToString() << "MultiGet user key comparison: \n"
+              << hist_mget.ToString();
+    std::cout << "Put(): Pre and Post Process Time: \n"
+              << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+              << hist_write_wal_time.ToString() << "\n"
+              << " Writing Mem Table time: \n"
+              << hist_write_memtable_time.ToString() << "\n"
+              << " Write Delay: \n"
+              << hist_write_delay_time.ToString() << "\n"
+              << " Waiting for Batch time: \n"
+              << hist_write_thread_wait_nanos.ToString() << "\n"
+              << " Scheduling Flushes and Compactions Time: \n"
+              << hist_write_scheduling_time.ToString() << "\n"
+              << " Total DB mutex nanos: \n"
+              << total_db_mutex_nanos << "\n";
+
+    std::cout << "Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_get_post_process.ToString() << "\n";
+
+    std::cout << "MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+    EXPECT_GT(hist_write_pre_post.Average(), 0);
+    EXPECT_GT(hist_write_wal_time.Average(), 0);
+    EXPECT_GT(hist_write_memtable_time.Average(), 0);
+    EXPECT_EQ(hist_write_delay_time.Average(), 0);
+    EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+    EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+    ASSERT_LT(total_db_mutex_nanos, 100U);
+#endif
+  }
+
+  db.reset();
+  db = OpenDb(true);
+
+  hist_get.Clear();
+  hist_get_snapshot.Clear();
+  hist_get_memtable.Clear();
+  hist_get_files.Clear();
+  hist_get_post_process.Clear();
+  hist_num_memtable_checked.Clear();
+
+  hist_mget.Clear();
+  hist_mget_snapshot.Clear();
+  hist_mget_memtable.Clear();
+  hist_mget_files.Clear();
+  hist_mget_post_process.Clear();
+  hist_mget_num_memtable_checked.Clear();
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + std::to_string(i);
+    std::string expected_value = "v" + std::to_string(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
hist_mget_post_process.Add(get_perf_context()->get_post_process_time); + hist_mget.Add(get_perf_context()->user_key_comparison_count); + } + + if (FLAGS_verbose) { + std::cout << "ReadOnly Get user key comparison: \n" + << hist_get.ToString() + << "ReadOnly MultiGet user key comparison: \n" + << hist_mget.ToString(); + + std::cout << "ReadOnly Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_get_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; + + std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n" + << hist_mget_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_mget_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_mget_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_mget_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_mget_post_process.ToString() << "\n"; + } + + if (enabled_time) { + ASSERT_GT(hist_get.Average(), 0); + ASSERT_GT(hist_get_memtable.Average(), 0); + ASSERT_GT(hist_get_files.Average(), 0); + ASSERT_GT(hist_num_memtable_checked.Average(), 0); + // In read-only mode Get(), no super version operation is needed + ASSERT_EQ(hist_get_post_process.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); + + ASSERT_GT(hist_mget.Average(), 0); + ASSERT_GT(hist_mget_snapshot.Average(), 0); + ASSERT_GT(hist_mget_memtable.Average(), 0); + ASSERT_GT(hist_mget_files.Average(), 0); + ASSERT_GT(hist_mget_post_process.Average(), 0); + ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(PerfContextTest, KeyComparisonCount) { + SetPerfLevel(kEnableCount); + ProfileQueries(); + + SetPerfLevel(kDisable); + ProfileQueries(); + + SetPerfLevel(kEnableTime); + ProfileQueries(true); +} +#endif // ROCKSDB_LITE + +// make perf_context_test +// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison +// For one memtable: +// ./perf_context_test --write_buffer_size=500000 --total_keys=10000 +// For two memtables: +// ./perf_context_test --write_buffer_size=250000 --total_keys=10000 +// Specify --random_key=1 to shuffle the key before insertion +// Results show that, for sequential insertion, worst-case Seek Key comparison +// is close to the total number of keys (linear), when there is only one +// memtable. When there are two memtables, even the avg Seek Key comparison +// starts to become linear to the input size. 
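+// (Intuition: a seek in a single skiplist memtable costs O(log n) key
+// comparisons on average, but the merging iterator has to position a child
+// iterator in every memtable and file, so each additional source adds its
+// own comparison cost.)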
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    RandomShuffle(std::begin(keys), std::end(keys));
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(SystemClock::Default().get());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    get_perf_context()->Reset();
+    timer.Start();
+    ASSERT_OK(db->Put(write_options, key, value));
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put time:\n"
+              << hist_put_time.ToString() << "WAL time:\n"
+              << hist_wal_time.ToString() << "time diff:\n"
+              << hist_time_diff.ToString();
+  }
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    get_perf_context()->Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    get_perf_context()->Reset();
+    iter->Next();
+    hist_next.Add(get_perf_context()->user_key_comparison_count);
+  }
+  ASSERT_OK(iter->status());
+  if (FLAGS_verbose) {
+    std::cout << "Seek:\n"
+              << hist_seek.ToString() << "Next:\n"
+              << hist_next.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (PerfLevel perf_level_test :
+       {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+    for (int c = 0; c < 2; ++c) {
+      InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+                              stats_code[c]);
+      mutex.Lock();
+      ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+        SetPerfLevel(perf_level_test);
+        get_perf_context()->Reset();
+        ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        mutex.Lock();
+        mutex.Unlock();
+        if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+            stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+          ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        } else {
+          // increment the counter only when it's a DB Mutex
+          ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+        }
+      });
+      SystemClock::Default()->SleepForMicroseconds(100);
+      mutex.Unlock();
+      child_thread.join();
+    }
+  }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+  SetPerfLevel(kEnableTime);
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (int c = 0; c < 2; ++c) {
+    InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+                            stats_code[c]);
+    InstrumentedCondVar lock(&mutex);
+    get_perf_context()->Reset();
+    mutex.Lock();
+    lock.TimedWait(100);
+    mutex.Unlock();
+    if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+      // increment the counter only when it's a DB Mutex
ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0); + } else { + ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0); + } + } +} + +TEST_F(PerfContextTest, ToString) { + get_perf_context()->Reset(); + get_perf_context()->block_read_count = 12345; + + std::string zero_included = get_perf_context()->ToString(); + ASSERT_NE(std::string::npos, zero_included.find("= 0")); + ASSERT_NE(std::string::npos, zero_included.find("= 12345")); + + std::string zero_excluded = get_perf_context()->ToString(true); + ASSERT_EQ(std::string::npos, zero_excluded.find("= 0")); + ASSERT_NE(std::string::npos, zero_excluded.find("= 12345")); +} + +TEST_F(PerfContextTest, MergeOperatorTime) { + ASSERT_OK(DestroyDB(kDbName, Options())); + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Status s = DB::Open(options, kDbName, &db); + EXPECT_OK(s); + + std::string val; + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4")); + + SetPerfLevel(kEnableTime); + get_perf_context()->Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); +#ifdef OS_SOLARIS + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + } +#endif + EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0); + + ASSERT_OK(db->Flush(FlushOptions())); + + get_perf_context()->Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); +#ifdef OS_SOLARIS + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + } +#endif + EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + get_perf_context()->Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); +#ifdef OS_SOLARIS + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + } +#endif + EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0); + + delete db; +} + +TEST_F(PerfContextTest, CopyAndMove) { + // Assignment operator + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_assign; + perf_context_assign = *get_perf_context(); + ASSERT_EQ( + 1, + (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, + (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful); + perf_context_assign.ClearPerLevelPerfContext(); + perf_context_assign.Reset(); + } + // Copy constructor + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_copy(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + } + // Move constructor + { + get_perf_context()->Reset(); + 
get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_move = std::move(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + perf_context_move.ClearPerLevelPerfContext(); + perf_context_move.Reset(); + } +} + +TEST_F(PerfContextTest, PerfContextDisableEnable) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PerfContext perf_context_copy(*get_perf_context()); + ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0] + .bloom_filter_full_positive); + // this was set when per level perf context is disabled, should not be copied + ASSERT_NE( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); +} + +TEST_F(PerfContextTest, PerfContextByLevelGetSet) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ( + 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] + .bloom_filter_full_positive); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2] + .bloom_filter_full_true_positive); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[0].block_cache_hit_count); + ASSERT_EQ( + 5, + (*(get_perf_context()->level_to_perf_context))[2].block_cache_hit_count); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[3].block_cache_miss_count); + ASSERT_EQ( + 4, + (*(get_perf_context()->level_to_perf_context))[1].block_cache_miss_count); + std::string zero_excluded = get_perf_context()->ToString(true); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7")); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_full_positive = 1@level0")); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_full_true_positive = 1@level2")); + ASSERT_NE(std::string::npos, + 
zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2")); + ASSERT_NE(std::string::npos, + zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3")); +} + +TEST_F(PerfContextTest, CPUTimer) { + if (SystemClock::Default()->CPUNanos() == 0) { + ROCKSDB_GTEST_SKIP("Target without CPUNanos support"); + return; + } + + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + + std::string max_str = "0"; + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string i_str = std::to_string(i); + std::string key = "k" + i_str; + std::string value = "v" + i_str; + max_str = max_str > i_str ? max_str : i_str; + + ASSERT_OK(db->Put(write_options, key, value)); + } + std::string last_key = "k" + max_str; + std::string last_value = "v" + max_str; + + { + // Get + get_perf_context()->Reset(); + std::string value; + ASSERT_OK(db->Get(read_options, "k0", &value)); + ASSERT_EQ(value, "v0"); + + if (FLAGS_verbose) { + std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos + << "ns\n"; + } + + // Iter + std::unique_ptr<Iterator> iter(db->NewIterator(read_options)); + + // Seek + get_perf_context()->Reset(); + iter->Seek(last_key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Seek CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekForPrev + get_perf_context()->Reset(); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekForPrev CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekToLast + get_perf_context()->Reset(); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekToLast CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekToFirst + get_perf_context()->Reset(); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekToFirst CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // Next + get_perf_context()->Reset(); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Next CPU time nanos: " + << get_perf_context()->iter_next_cpu_nanos << "ns\n"; + } + + // Prev + get_perf_context()->Reset(); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Prev CPU time nanos: " + << get_perf_context()->iter_prev_cpu_nanos << "ns\n"; + } + + // monotonically increasing + get_perf_context()->Reset(); + auto count = get_perf_context()->iter_seek_cpu_nanos; + for (int i = 0; i < FLAGS_total_keys; ++i) { + iter->Seek("k" + std::to_string(i)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v" + std::to_string(i), iter->value().ToString()); + auto next_count = get_perf_context()->iter_seek_cpu_nanos; + ASSERT_GT(next_count, count); + count = next_count; + } + + // iterator creation/destruction; multiple iterators + { + std::unique_ptr<Iterator> iter2(db->NewIterator(read_options)); + ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); + iter2->Seek(last_key); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ(last_value, iter2->value().ToString()); + 
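+ // A second iterator shares the same thread-local perf context, so its + // Seek() above should advance iter_seek_cpu_nanos past the saved count: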
ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count); + count = get_perf_context()->iter_seek_cpu_nanos; + } + ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + for (int i = 1; i < argc; i++) { + int n; + char junk; + + if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } + + if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) { + FLAGS_total_keys = n; + } + + if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_random_key = n; + } + + if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_set_based_memetable = n; + } + + if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_verbose = n; + } + } + + if (FLAGS_verbose) { + std::cout << kDbName << "\n"; + } + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/periodic_task_scheduler.cc b/src/rocksdb/db/periodic_task_scheduler.cc new file mode 100644 index 000000000..2024510dd --- /dev/null +++ b/src/rocksdb/db/periodic_task_scheduler.cc @@ -0,0 +1,113 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/periodic_task_scheduler.h" + +#include "rocksdb/system_clock.h" + +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +// `timer_mutex` is a global mutex that currently serves 3 purposes: +// (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as +// they are currently not implemented in a thread-safe way; +// (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and +// the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically; and +// (3) to protect tasks_map_ in PeriodicTaskScheduler. +// Note: a static global mutex is not efficient in general, but for +// PeriodicTaskScheduler it should be okay, as its operations are called +// infrequently. 
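+// For example, Register() below holds `timer_mutex` across Timer::Start() and +// Timer::Add(), so a concurrent Unregister() cannot run Timer::Shutdown() in +// between the two calls.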
+static port::Mutex timer_mutex; + +static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = { + {PeriodicTaskType::kDumpStats, kInvalidPeriodSec}, + {PeriodicTaskType::kPersistStats, kInvalidPeriodSec}, + {PeriodicTaskType::kFlushInfoLog, 10}, + {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec}, +}; + +static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = { + {PeriodicTaskType::kDumpStats, "dump_st"}, + {PeriodicTaskType::kPersistStats, "pst_st"}, + {PeriodicTaskType::kFlushInfoLog, "flush_info_log"}, + {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"}, +}; + +Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, + const PeriodicTaskFunc& fn) { + return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type)); +} + +Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, + const PeriodicTaskFunc& fn, + uint64_t repeat_period_seconds) { + MutexLock l(&timer_mutex); + static std::atomic<uint64_t> initial_delay(0); + + if (repeat_period_seconds == kInvalidPeriodSec) { + return Status::InvalidArgument("Invalid task repeat period"); + } + auto it = tasks_map_.find(task_type); + if (it != tasks_map_.end()) { + // the task already exists and the period is unchanged, no update needed + if (it->second.repeat_every_sec == repeat_period_seconds) { + return Status::OK(); + } + // cancel the existing one before registering a new one + timer_->Cancel(it->second.name); + tasks_map_.erase(it); + } + + timer_->Start(); + // put the task type name as a prefix, for easy debugging + std::string unique_id = + kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++); + + bool succeeded = timer_->Add( + fn, unique_id, + (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond, + repeat_period_seconds * kMicrosInSecond); + if (!succeeded) { + return Status::Aborted("Failed to register periodic task"); + } + auto result = tasks_map_.try_emplace( + task_type, TaskInfo{unique_id, repeat_period_seconds}); + if (!result.second) { + return Status::Aborted("Failed to add periodic task"); + } + return Status::OK(); +} + +Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) { + MutexLock l(&timer_mutex); + auto it = tasks_map_.find(task_type); + if (it != tasks_map_.end()) { + timer_->Cancel(it->second.name); + tasks_map_.erase(it); + } + if (!timer_->HasPendingTask()) { + timer_->Shutdown(); + } + return Status::OK(); +} + +Timer* PeriodicTaskScheduler::Default() { + static Timer timer(SystemClock::Default().get()); + return &timer; +} + +#ifndef NDEBUG +void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) { + static Timer test_timer(clock); + test_timer.TEST_OverrideTimer(clock); + MutexLock l(&timer_mutex); + timer_ = &test_timer; +} +#endif // NDEBUG + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/periodic_task_scheduler.h b/src/rocksdb/db/periodic_task_scheduler.h new file mode 100644 index 000000000..f45b80c4d --- /dev/null +++ b/src/rocksdb/db/periodic_task_scheduler.h @@ -0,0 +1,110 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
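+ +// Illustrative usage (editorial sketch, not part of the upstream sources; +// the actual API is declared below): +// +// PeriodicTaskScheduler scheduler; +// Status s = scheduler.Register( +// PeriodicTaskType::kFlushInfoLog, [] { /* flush the info log */ }, +// /*repeat_period_seconds=*/10); +// ... +// s = scheduler.Unregister(PeriodicTaskType::kFlushInfoLog);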
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include "util/timer.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +using PeriodicTaskFunc = std::function<void()>; + +constexpr uint64_t kInvalidPeriodSec = 0; + +// List of task types +enum class PeriodicTaskType : uint8_t { + kDumpStats = 0, + kPersistStats, + kFlushInfoLog, + kRecordSeqnoTime, + kMax, +}; + +// PeriodicTaskScheduler contains the periodic tasks scheduled by the DB +// instance. It's used to schedule/unschedule DumpStats(), PersistStats(), +// FlushInfoLog(), etc. Each type of task can only have one instance; +// re-registering the same task type only updates the repeat period. +// +// Internally, it uses a global single-threaded timer object to run the +// periodic task functions. The timer thread is always started, since the +// info log flushing cannot be disabled. +class PeriodicTaskScheduler { + public: + explicit PeriodicTaskScheduler() = default; + + PeriodicTaskScheduler(const PeriodicTaskScheduler&) = delete; + PeriodicTaskScheduler(PeriodicTaskScheduler&&) = delete; + PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete; + PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete; + + // Register a task with its default repeat period + Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn); + + // Register a task with a specified repeat period. 0 (kInvalidPeriodSec) is + // an invalid argument. To stop the task, use Unregister() + Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, + uint64_t repeat_period_seconds); + + // Unregister the task + Status Unregister(PeriodicTaskType task_type); + +#ifndef NDEBUG + // Override the timer for the unittest + void TEST_OverrideTimer(SystemClock* clock); + + // Call Timer::TEST_WaitForRun(), which waits until the Timer starts waiting. + void TEST_WaitForRun(const std::function<void()>& callback) const { + if (timer_ != nullptr) { + timer_->TEST_WaitForRun(callback); + } + } + + // Get the number of valid tasks in the global Timer + size_t TEST_GetValidTaskNum() const { + if (timer_ != nullptr) { + return timer_->TEST_GetPendingTaskNum(); + } + return 0; + } + + // Whether the specified task type is registered + bool TEST_HasTask(PeriodicTaskType task_type) const { + auto it = tasks_map_.find(task_type); + return it != tasks_map_.end(); + } +#endif // NDEBUG + + private: + // default global Timer instance + static Timer* Default(); + + // Internal structure to store task information + struct TaskInfo { + TaskInfo(std::string _name, uint64_t _repeat_every_sec) + : name(std::move(_name)), repeat_every_sec(_repeat_every_sec) {} + std::string name; + uint64_t repeat_every_sec; + }; + + // Internal tasks map + std::map<PeriodicTaskType, TaskInfo> tasks_map_; + + // Global timer pointer; it doesn't support synchronously adding/canceling + // tasks, hence the global `timer_mutex` for task add/cancel. + Timer* timer_ = Default(); + + // Global task id, protected by the global `timer_mutex` + inline static uint64_t id_; + + static constexpr uint64_t kMicrosInSecond = 1000U * 1000U; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/periodic_task_scheduler_test.cc b/src/rocksdb/db/periodic_task_scheduler_test.cc new file mode 100644 index 000000000..4abea4d5e --- /dev/null +++ b/src/rocksdb/db/periodic_task_scheduler_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/periodic_task_scheduler.h" + +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class PeriodicTaskSchedulerTest : public DBTestBase { + public: + PeriodicTaskSchedulerTest() + : DBTestBase("periodic_task_scheduler_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } + + protected: + std::unique_ptr<Env> mock_env_; + std::shared_ptr<MockSystemClock> mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + auto periodic_task_scheduler_ptr = + reinterpret_cast<PeriodicTaskScheduler*>(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); + }); + } +}; + +TEST_F(PeriodicTaskSchedulerTest, Basic) { + constexpr unsigned int kPeriodSec = 10; + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + + int flush_info_log_counter = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushInfoLog:StartRunning", + [&](void*) { flush_info_log_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_GT(kPeriodSec, 1u); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec) - 1); + }); + + const PeriodicTaskScheduler& scheduler = + dbfull()->TEST_GetPeriodicTaskScheduler(); + ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum()); + + ASSERT_EQ(1, dump_st_counter); + ASSERT_EQ(1, pst_st_counter); + ASSERT_EQ(1, flush_info_log_counter); + + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); }); + + ASSERT_EQ(2, dump_st_counter); + ASSERT_EQ(2, pst_st_counter); + ASSERT_EQ(2, flush_info_log_counter); + + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); }); + + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(3, flush_info_log_counter); + + // Disable scheduler with SetOption + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}})); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + // Info log flush should still run. 
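+ // (With both stats tasks unregistered, another period should bump only + // flush_info_log_counter.)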
+ dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); }); + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(4, flush_info_log_counter); + + ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum()); + + // Re-enable one task + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}})); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum()); + + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); }); + ASSERT_EQ(4, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(5, flush_info_log_counter); + + Close(); +} + +TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { + constexpr int kPeriodSec = 5; + const int kInstanceNum = 10; + + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto dbs = std::vector<DB*>(kInstanceNum); + for (int i = 0; i < kInstanceNum; i++) { + ASSERT_OK( + DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i]))); + } + + auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]); + + const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler(); + ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum()); + + int expected_run = kInstanceNum; + dbi->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + int half = kInstanceNum / 2; + for (int i = 0; i < half; i++) { + delete dbs[i]; + } + + expected_run += (kInstanceNum - half) * 2; + + dbi->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + dbi->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + for (int i = half; i < kInstanceNum; i++) { + ASSERT_OK(dbs[i]->Close()); + delete dbs[i]; + } +} + +TEST_F(PeriodicTaskSchedulerTest, MultiEnv) { + constexpr int kDumpPeriodSec = 5; + constexpr int kPersistPeriodSec = 10; + Close(); + Options options1; + options1.stats_dump_period_sec = kDumpPeriodSec; + options1.stats_persist_period_sec = kPersistPeriodSec; + options1.create_if_missing = true; + options1.env = mock_env_.get(); + + Reopen(options1); + + std::unique_ptr<Env> mock_env2( + new CompositeEnvWrapper(Env::Default(), mock_clock_)); + Options options2; + options2.stats_dump_period_sec = kDumpPeriodSec; + options2.stats_persist_period_sec = kPersistPeriodSec; + 
options2.create_if_missing = true; + options2.env = mock_env2.get(); + + std::string dbname = test::PerThreadDBPath("multi_env_test"); + DB* db; + ASSERT_OK(DB::Open(options2, dbname, &db)); + + ASSERT_OK(db->Close()); + delete db; + Close(); +} + +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h new file mode 100644 index 000000000..0fcf231da --- /dev/null +++ b/src/rocksdb/db/pinned_iterators_manager.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include <algorithm> +#include <memory> +#include <utility> +#include <vector> + +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// PinnedIteratorsManager will be notified whenever we need to pin an Iterator +// and it will be responsible for deleting pinned Iterators when they are +// not needed anymore. +class PinnedIteratorsManager : public Cleanable { + public: + PinnedIteratorsManager() : pinning_enabled(false) {} + ~PinnedIteratorsManager() { + if (pinning_enabled) { + ReleasePinnedData(); + } + } + + // Move constructor and move assignment are allowed. + PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default; + PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept = + default; + + // Enable Iterators pinning + void StartPinning() { + assert(pinning_enabled == false); + pinning_enabled = true; + } + + // Is pinning enabled? + bool PinningEnabled() { return pinning_enabled; } + + // Take ownership of iter and delete it when ReleasePinnedData() is called + void PinIterator(InternalIterator* iter, bool arena = false) { + if (arena) { + PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator); + } else { + PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator); + } + } + + using ReleaseFunction = void (*)(void* arg1); + void PinPtr(void* ptr, ReleaseFunction release_func) { + assert(pinning_enabled); + if (ptr == nullptr) { + return; + } + pinned_ptrs_.emplace_back(ptr, release_func); + } + + // Release pinned Iterators + inline void ReleasePinnedData() { + assert(pinning_enabled == true); + pinning_enabled = false; + + // Remove duplicate pointers + std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end()); + auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end()); + + for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) { + void* ptr = i->first; + ReleaseFunction release_func = i->second; + release_func(ptr); + } + pinned_ptrs_.clear(); + // Also do cleanups from the base Cleanable + Cleanable::Reset(); + } + + private: + static void ReleaseInternalIterator(void* ptr) { + delete reinterpret_cast<InternalIterator*>(ptr); + } + + static void ReleaseArenaInternalIterator(void* ptr) { + reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator(); + } + + bool pinning_enabled; + std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc new file mode 100644 index 000000000..755b639b0 --- /dev/null +++ b/src/rocksdb/db/plain_table_db_test.cc @@ -0,0 +1,1357 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <set> + +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/plain/plain_table_reader.h" +#include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +class PlainTableKeyDecoderTest : public testing::Test {}; + +TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { + Random rnd(301); + const uint32_t kLength = 2222; + std::string tmp = rnd.RandomString(kLength); + Slice contents(tmp); + test::StringSource* string_source = + new test::StringSource(contents, 0, false); + std::unique_ptr<FSRandomAccessFile> holder(string_source); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(holder), "test")); + std::unique_ptr<PlainTableReaderFileInfo> file_info( + new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), + kLength)); + + { + PlainTableFileReader reader(file_info.get()); + + const uint32_t kReadSize = 77; + for (uint32_t pos = 0; pos < kLength; pos += kReadSize) { + uint32_t read_size = std::min(kLength - pos, kReadSize); + Slice out; + ASSERT_TRUE(reader.Read(pos, read_size, &out)); + ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size))); + } + + ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2); + } + + std::vector<std::vector<std::pair<uint32_t, size_t>>> reads = { + {{600, 30}, {590, 30}, {600, 20}, {600, 40}}, + {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}}, + {{1000, 20}, {500, 20}, {1000, 50}}, + {{1000, 20}, {500, 20}, {500, 20}}, + {{1000, 20}, {500, 20}, {200, 20}, {500, 20}}, + {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}}, + {{600, 500}, {610, 20}, {100, 20}}, + {{500, 100}, {490, 100}, {550, 50}}, + }; + + std::vector<size_t> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2}; + + for (size_t i = 0; i < reads.size(); i++) { + string_source->set_total_reads(0); + PlainTableFileReader reader(file_info.get()); + for (auto p : reads[i]) { + Slice out; + ASSERT_TRUE(reader.Read(p.first, p.second, &out)); + ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second))); + } + ASSERT_EQ(num_file_reads[i], string_source->total_reads()); + } +} + +class PlainTableDBTest : public testing::Test, + public testing::WithParamInterface<bool> { + protected: + private: + std::string dbname_; + Env* env_; + DB* db_; + + bool mmap_mode_; + Options last_options_; + + public: + PlainTableDBTest() : env_(Env::Default()) {} + + ~PlainTableDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, Options())); + } + + void SetUp() override { + 
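+ // GetParam() selects the mmap-reads axis of this parameterized test; it + // feeds options.allow_mmap_reads in CurrentOptions().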
mmap_mode_ = GetParam(); + dbname_ = test::PerThreadDBPath("plain_table_db_test"); + EXPECT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + // Return the current option configuration. + Options CurrentOptions() { + Options options; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = 2; + plain_table_options.hash_table_ratio = 0.8; + plain_table_options.index_sparseness = 3; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPrefix; + plain_table_options.full_scan_mode = false; + plain_table_options.store_index_in_file = false; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true)); + + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.allow_mmap_reads = mmap_mode_; + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + return options; + } + + DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); } + + void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } + + void Close() { + delete db_; + db_ = nullptr; + } + + bool mmap_mode() const { return mmap_mode_; } + + void DestroyAndReopen(Options* options = nullptr) { + // Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status ReopenForReadOnly(Options* options) { + delete db_; + db_ = nullptr; + return DB::OpenForReadOnly(*options, dbname_, &db_); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(level), &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST_P(PlainTableDBTest, Empty) { + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +extern const uint64_t kPlainTableMagicNumber; + +class TestPlainTableReader : public PlainTableReader { + public: + TestPlainTableReader( + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + std::unique_ptr&& props, + std::unique_ptr&& file, + const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, const std::string& column_family_name) + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, + encoding_type, file_size, props.get(), + prefix_extractor), + expect_bloom_not_match_(expect_bloom_not_match) { + Status s = MmapDataIfNeeded(); + EXPECT_TRUE(s.ok()); + + s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio, + index_sparseness, 2 * 1024 * 1024); + EXPECT_TRUE(s.ok()); + + EXPECT_EQ(column_family_id, static_cast(props->column_family_id)); + EXPECT_EQ(column_family_name, props->column_family_name); + if (store_index_in_file) { + auto bloom_version_ptr = props->user_collected_properties.find( + PlainTablePropertyNames::kBloomVersion); + EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); + EXPECT_EQ(bloom_version_ptr->second, std::string("1")); + if (ioptions.bloom_locality > 0) { + auto num_blocks_ptr = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); + } + } + table_properties_ = std::move(props); + } + + ~TestPlainTableReader() override {} + + private: + bool MatchBloom(uint32_t hash) const override { + bool ret = PlainTableReader::MatchBloom(hash); + if (*expect_bloom_not_match_) { + EXPECT_TRUE(!ret); + } else { + EXPECT_TRUE(ret); + } + return ret; + } + bool* expect_bloom_not_match_; +}; + +extern const uint64_t kPlainTableMagicNumber; +class TestPlainTableFactory : public PlainTableFactory { + public: + explicit TestPlainTableFactory(bool* expect_bloom_not_match, + const PlainTableOptions& options, + uint32_t column_family_id, + std::string column_family_name) + : PlainTableFactory(options), + bloom_bits_per_key_(options.bloom_bits_per_key), + hash_table_ratio_(options.hash_table_ratio), + index_sparseness_(options.index_sparseness), + store_index_in_file_(options.store_index_in_file), + expect_bloom_not_match_(expect_bloom_not_match), + column_family_id_(column_family_id), + column_family_name_(std::move(column_family_name)) {} + + using PlainTableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const override { + std::unique_ptr props; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, &props); + 
EXPECT_TRUE(s.ok()); + + if (store_index_in_file_) { + BlockHandle bloom_block_handle; + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + BloomBlockBuilder::kBloomBlock, + &bloom_block_handle); + EXPECT_TRUE(s.ok()); + + BlockHandle index_block_handle; + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_handle); + EXPECT_TRUE(s.ok()); + } + + auto& user_props = props->user_collected_properties; + auto encoding_type_prop = + user_props.find(PlainTablePropertyNames::kEncodingType); + assert(encoding_type_prop != user_props.end()); + EncodingType encoding_type = static_cast<EncodingType>( + DecodeFixed32(encoding_type_prop->second.c_str())); + + std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader( + table_reader_options.env_options, + table_reader_options.internal_comparator, encoding_type, file_size, + bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, + std::move(props), std::move(file), table_reader_options.ioptions, + table_reader_options.prefix_extractor.get(), expect_bloom_not_match_, + store_index_in_file_, column_family_id_, column_family_name_)); + + *table = std::move(new_reader); + return s; + } + + private: + int bloom_bits_per_key_; + double hash_table_ratio_; + size_t index_sparseness_; + bool store_index_in_file_; + bool* expect_bloom_not_match_; + const uint32_t column_family_id_; + const std::string column_family_name_; +}; + +TEST_P(PlainTableDBTest, BadOptions1) { + // Build with a prefix extractor + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + // Bad attempt to re-open without a prefix extractor + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + ASSERT_EQ( + "Invalid argument: Prefix extractor is missing when opening a PlainTable " + "built using a prefix extractor", + TryReopen(&options).ToString()); + + // Bad attempt to re-open with different prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(6)); + ASSERT_EQ( + "Invalid argument: Prefix extractor given doesn't match the one used to " + "build PlainTable", + TryReopen(&options).ToString()); + + // Correct prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + +TEST_P(PlainTableDBTest, BadOptions2) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + options.create_if_missing = true; + DestroyAndReopen(&options); + // Build without a prefix extractor + // (apparently works even if hash_table_ratio > 0) + ASSERT_OK(Put("1000000000000foo", "v1")); + // Build without a prefix extractor; this call will fail and return the + // status for this bad attempt. + ASSERT_NOK(dbfull()->TEST_FlushMemTable()); + + // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor + Status s = TryReopen(&options); + ASSERT_EQ( + "Not implemented: PlainTable requires a prefix extractor enable prefix " + "hash mode.", + s.ToString()); + + // OK to open with hash_table_ratio == 0 and no prefix extractor + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); + + // OK to open newly with a prefix_extractor and hash table; builds index + // in memory. 
+ options = CurrentOptions(); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + +TEST_P(PlainTableDBTest, Flush) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + for (int bloom = -1; bloom <= 117; bloom += 117) { + const int bloom_bits = std::max(bloom, 0); + const bool full_scan_mode = bloom < 0; + for (int total_order = 0; total_order <= 1; total_order++) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor.reset(); + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = full_scan_mode; + plain_table_options.store_index_in_file = store_index_in_file; + + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = full_scan_mode; + plain_table_options.store_index_in_file = store_index_in_file; + + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } + DestroyAndReopen(&options); + uint64_t int_num; + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + TablePropertiesCollection ptc; + ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables( + &ptc)); + ASSERT_EQ(1U, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + + if (full_scan_mode) { + // Does not support Get/Seek + std::unique_ptr<Iterator> iter( + dbfull()->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("0000000000000bar", iter->key().ToString()); + ASSERT_EQ("v2", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000000foo", iter->key().ToString()); + ASSERT_EQ("v3", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().ok()); + } else { + if (!store_index_in_file) { + ASSERT_EQ(total_order ? 
"4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } + } + } + } + } +} + +TEST_P(PlainTableDBTest, Flush2) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + if (encoding_type == kPrefix && total_order) { + continue; + } + if (!bloom_bits && store_index_in_file) { + continue; + } + if (total_order && store_index_in_file) { + continue; + } + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + if (total_order) { + options.prefix_extractor = nullptr; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + } else { + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + } + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.store_index_in_file = store_index_in_file; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_OK(Put("1000000000000foo", "v2")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } + } + } + } +} + +TEST_P(PlainTableDBTest, Immortal) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = -1; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 10; + plain_table_options.encoding_type = encoding_type; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + int copied = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ(2, copied); + copied = 0; + + Close(); + ASSERT_OK(ReopenForReadOnly(&options)); + + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + if (mmap_mode()) { + ASSERT_EQ(0, copied); + } else { + ASSERT_EQ(2, copied); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(PlainTableDBTest, Iterator) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + if (encoding_type == kPrefix && total_order == 1) { + continue; + } + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. 
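+ // (TestPlainTableReader::MatchBloom is expected to report a definite + // miss for these probes.)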
+ expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } + } + ASSERT_OK(iter->status()); + delete iter; + } + } + } + } +} + +namespace { +std::string NthKey(size_t n, char filler) { + std::string rv(16, filler); + rv[0] = n % 10; + rv[1] = (n / 10) % 10; + rv[2] = (n / 100) % 10; + rv[3] = (n / 1000) % 10; + return rv; +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, BloomSchema) { + Options options = CurrentOptions(); + options.create_if_missing = true; + for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) { + options.bloom_locality = bloom_locality; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 3; // high FP rate for test + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + + bool expect_bloom_not_match = false; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */, + kDefaultColumnFamilyName)); + DestroyAndReopen(&options); + + for (unsigned i = 0; i < 2345; ++i) { + ASSERT_OK(Put(NthKey(i, 'y'), "added")); + } + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("added", Get(NthKey(42, 'y'))); + + for (unsigned i = 0; i < 32; ++i) { + // Known pattern of Bloom filter false positives can detect schema change + // with high probability. Known FPs stuffed into bits: + uint32_t pattern; + if (!bloom_locality) { + pattern = 1785868347UL; + } else if (CACHE_LINE_SIZE == 64U) { + pattern = 2421694657UL; + } else if (CACHE_LINE_SIZE == 128U) { + pattern = 788710956UL; + } else { + ASSERT_EQ(CACHE_LINE_SIZE, 256U); + pattern = 163905UL; + } + bool expect_fp = pattern & (1UL << i); + // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp); + expect_bloom_not_match = !expect_fp; + ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n'))); + } + } +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, IteratorLargeKeys) { + Options options = CurrentOptions(); + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + options.create_if_missing = true; + options.prefix_extractor.reset(); + DestroyAndReopen(&options); + + std::string key_list[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'), + MakeLongKey(32, '2'), MakeLongKey(60, '3'), + MakeLongKey(90, '4'), MakeLongKey(50, '5'), + MakeLongKey(26, '6')}; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], std::to_string(i))); + } + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(std::to_string(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +namespace { +std::string MakeLongKeyWithPrefix(size_t length, char c) { + return 
"00000000" + std::string(length - 8, c); +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) { + Options options = CurrentOptions(); + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0.8; + plain_table_options.index_sparseness = 3; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPrefix; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + options.create_if_missing = true; + DestroyAndReopen(&options); + + std::string key_list[] = { + MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'), + MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'), + MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'), + MakeLongKeyWithPrefix(26, '6')}; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], std::to_string(i))); + } + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(std::to_string(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + test::SimpleSuffixReverseComparator comp; + options.comparator = ∁ + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", 
iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + + delete iter; +} + +TEST_P(PlainTableDBTest, HashBucketConflict) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2 ^ i; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo8"); + ASSERT_TRUE(!iter->Valid() || + options.comparator->Compare(iter->key(), "20000001") > 0); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + ASSERT_OK(iter->status()); + delete iter; + } + } +} + +TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + 
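+ // As in HashBucketConflict above, `2 ^ i` in the index_sparseness setting + // below is bitwise XOR (3, 0, 1 for i = 1..3), not a power of two.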
+      test::SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+      iter->Seek("5000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+      std::string seek_key = "2000000000000bar";
+      iter->Seek(seek_key);
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.prefix_extractor->Transform(iter->key()) !=
+                      options.prefix_extractor->Transform(seek_key));
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      ASSERT_OK(iter->status());
+      delete iter;
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 5; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v3")); + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("5000000000000fo2")); + + ASSERT_EQ("NOT_FOUND", Get("8000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + ASSERT_OK(iter->status()); + delete iter; +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +TEST_P(PlainTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 120 << 10; // 120KB + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (10 values, each 12K) + for (int i = 0; i < 10; i++) { + values.push_back(rnd.RandomString(12 << 10)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Put(Key(999), "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(rnd.RandomString(10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Put(Key(999), "")); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +TEST_P(PlainTableDBTest, AdaptiveTable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + + options.table_factory.reset(NewPlainTableFactory()); + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + options.create_if_missing = false; + std::shared_ptr block_based_factory( + NewBlockBasedTableFactory()); + std::shared_ptr plain_table_factory(NewPlainTableFactory()); + std::shared_ptr dummy_factory; + options.table_factory.reset(NewAdaptiveTableFactory( + block_based_factory, block_based_factory, plain_table_factory)); + Reopen(&options); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + + ASSERT_OK(Put("2000000000000foo", "v4")); + ASSERT_OK(Put("3000000000000bar", "v5")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v4", Get("2000000000000foo")); + ASSERT_EQ("v5", Get("3000000000000bar")); + + Reopen(&options); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", 
Get("0000000000000bar"));
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  options.paranoid_checks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v3", Get("1000000000000foo"));
+
+  options.paranoid_checks = false;
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/post_memtable_callback.h b/src/rocksdb/db/post_memtable_callback.h
new file mode 100644
index 000000000..fbf2fbe86
--- /dev/null
+++ b/src/rocksdb/db/post_memtable_callback.h
@@ -0,0 +1,25 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback invoked after finishing writing to the memtable but before
+// publishing the sequence number to readers.
+// Note that with write-prepared/write-unprepared transactions with
+// two-write-queues, PreReleaseCallback is called before publishing the
+// sequence numbers to readers.
+class PostMemTableCallback {
+ public:
+  virtual ~PostMemTableCallback() {}
+
+  virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..6b9039487
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PreReleaseCallback {
+ public:
+  virtual ~PreReleaseCallback() {}
+
+  // Will be called while on the write thread after the write to the WAL and
+  // before the write to memtable. This is useful if any operation needs to be
+  // done before the write gets visible to the readers, or if we want to reduce
+  // the overhead of locking by updating something sequentially while we are on
+  // the write thread. If the callback fails, this function returns a non-OK
+  // status, the sequence number will not be released, and the same status will
+  // be propagated to all the writers in the write group.
+  // seq is the sequence number that is used for this write and will be
+  // released.
+  // is_mem_disabled is currently used for debugging purposes to assert that
+  // the callback is done from the right write queue.
+  // If non-zero, log_number indicates the WAL log to which we wrote.
+  // index >= 0 specifies the order of callback in the same write thread.
+ // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. + virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, + uint64_t log_number, size_t index, size_t total) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc new file mode 100644 index 000000000..8592b8f31 --- /dev/null +++ b/src/rocksdb/db/prefix_test.cc @@ -0,0 +1,906 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run this test... Skipping...\n"); + return 0; +} +#else + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "monitoring/histogram.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "test_util/testharness.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_bool(trigger_deadlock, false, + "issue delete in range scan to trigger PrefixHashMap deadlock"); +DEFINE_int32(bucket_count, 100000, "number of buckets"); +DEFINE_uint64(num_locks, 10001, "number of locks"); +DEFINE_bool(random_prefix, false, "randomize prefix"); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int32(max_write_buffer_number, 2, ""); +DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(skiplist_height, 4, ""); +DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, ""); +DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, ""); +DEFINE_int32(value_size, 40, ""); +DEFINE_bool(enable_print, false, "Print options generated to console."); + +// Path to the database on file system +const std::string kDbName = + ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test"); + +namespace ROCKSDB_NAMESPACE { + +struct TestKey { + uint64_t prefix; + uint64_t sorted; + + TestKey(uint64_t _prefix, uint64_t _sorted) + : prefix(_prefix), sorted(_sorted) {} +}; + +// return a slice backed by test_key +inline Slice TestKeyToSlice(std::string& s, const TestKey& test_key) { + s.clear(); + PutFixed64(&s, test_key.prefix); + PutFixed64(&s, test_key.sorted); + return Slice(s.c_str(), s.size()); +} + +inline const TestKey SliceToTestKey(const Slice& slice) { + return TestKey(DecodeFixed64(slice.data()), DecodeFixed64(slice.data() + 8)); +} + +class TestKeyComparator : public Comparator { + public: + // Compare needs to be aware of the possibility of a and/or b is + // prefix only + int Compare(const Slice& a, const Slice& b) const override { + const TestKey kkey_a = SliceToTestKey(a); + const TestKey kkey_b = SliceToTestKey(b); + const TestKey* key_a = &kkey_a; + const TestKey* key_b = &kkey_b; + if (key_a->prefix != 
key_b->prefix) { + if (key_a->prefix < key_b->prefix) return -1; + if (key_a->prefix > key_b->prefix) return 1; + } else { + EXPECT_TRUE(key_a->prefix == key_b->prefix); + // note, both a and b could be prefix only + if (a.size() != b.size()) { + // one of them is prefix + EXPECT_TRUE( + (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) || + (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey))); + if (a.size() < b.size()) return -1; + if (a.size() > b.size()) return 1; + } else { + // both a and b are prefix + if (a.size() == sizeof(uint64_t)) { + return 0; + } + + // both a and b are whole key + EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey)); + if (key_a->sorted < key_b->sorted) return -1; + if (key_a->sorted > key_b->sorted) return 1; + if (key_a->sorted == key_b->sorted) return 0; + } + } + return 0; + } + + bool operator()(const TestKey& a, const TestKey& b) const { + std::string sa, sb; + return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0; + } + + const char* Name() const override { return "TestKeyComparator"; } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +namespace { +void PutKey(DB* db, WriteOptions write_options, uint64_t prefix, + uint64_t suffix, const Slice& value) { + TestKey test_key(prefix, suffix); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Put(write_options, key, value)); +} + +void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key, + const Slice& value) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Put(write_options, key, value)); +} + +void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key, + const Slice& value) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Merge(write_options, key, value)); +} + +void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Delete(write_options, key)); +} + +void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) { + TestKey test_key(prefix, suffix); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + iter->Seek(key); +} + +const std::string kNotFoundResult = "NOT_FOUND"; + +std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix, + uint64_t suffix) { + TestKey test_key(prefix, suffix); + std::string s2; + Slice key = TestKeyToSlice(s2, test_key); + + std::string result; + Status s = db->Get(read_options, key, &result); + if (s.IsNotFound()) { + result = kNotFoundResult; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} + +class SamePrefixTransform : public SliceTransform { + private: + const Slice prefix_; + std::string name_; + + public: + explicit SamePrefixTransform(const Slice& prefix) + : prefix_(prefix), name_("rocksdb.SamePrefix." 
+ prefix.ToString()) {} + + const char* Name() const override { return name_.c_str(); } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return prefix_; + } + + bool InDomain(const Slice& src) const override { + if (src.size() >= prefix_.size()) { + return Slice(src.data(), prefix_.size()) == prefix_; + } + return false; + } + + bool InRange(const Slice& dst) const override { return dst == prefix_; } + + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } +}; + +} // anonymous namespace + +class PrefixTest : public testing::Test { + public: + std::shared_ptr OpenDb() { + DB* db; + + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + options.memtable_prefix_bloom_size_ratio = + FLAGS_memtable_prefix_bloom_size_ratio; + options.memtable_huge_page_size = FLAGS_memtable_huge_page_size; + + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.allow_concurrent_memtable_write = false; + + Status s = DB::Open(options, kDbName, &db); + EXPECT_OK(s); + return std::shared_ptr(db); + } + + void FirstOption() { option_config_ = kBegin; } + + bool NextOptions(int bucket_count) { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + switch (option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count)); + return true; + case kHashLinkListHugePageTlb: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024)); + return true; + case kHashLinkListTriggerSkipList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count, 0, 3)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } + ~PrefixTest() override { delete options.comparator; } + + protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kHashLinkListHugePageTlb, + kHashLinkListTriggerSkipList, + kEnd + }; + int option_config_; + Options options; +}; + +TEST(SamePrefixTest, InDomainTest) { + DB* db; + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(new SamePrefixTransform("HHKB")); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + WriteOptions write_options; + ReadOptions read_options; + { + ASSERT_OK(DestroyDB(kDbName, Options())); + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); + ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); + ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); + ASSERT_OK(db->Flush(FlushOptions())); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Realforce 87u"); + ASSERT_TRUE(db_iter->Valid()); + 
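// "Realforce 87u" is outside SamePrefixTransform's InDomain(), so it gets
+    // no prefix filter entry; with whole_key_filtering disabled the read
+    // path must still locate it.
+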
ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), "Realforce 87u"); + ASSERT_EQ(db_iter->value(), "idk"); + + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } + + { + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "pikachu", "1")); + ASSERT_OK(db->Put(write_options, "Meowth", "1")); + ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); + ASSERT_OK(db->Flush(FlushOptions())); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Mewtwo"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } +} + +TEST_F(PrefixTest, TestResult) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets << std::endl; + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // 1. Insert one row. + Slice v16("v16"); + PutKey(db.get(), write_options, 1, 6, v16); + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6)); + + // 2. Insert an entry for the same prefix as the last entry in the bucket. + Slice v17("v17"); + PutKey(db.get(), write_options, 1, 7, v17); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + // 3. Insert an entry for the same prefix as the head of the bucket. 
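+      // (After this insert, a seek to (1, 5) must surface v15 first and then
+      // iterate v16 and v17 in order.)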
+ Slice v15("v15"); + PutKey(db.get(), write_options, 1, 5, v15); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + + // 4. Insert an entry with a larger prefix + Slice v22("v22"); + PutKey(db.get(), write_options, 2, 2, v22); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 2, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 5. Insert an entry with a smaller prefix + Slice v02("v02"); + PutKey(db.get(), write_options, 0, 2, v02); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 0, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 6. 
Insert to the beginning and the end of the first prefix + Slice v13("v13"); + Slice v18("v18"); + PutKey(db.get(), write_options, 1, 3, v13); + PutKey(db.get(), write_options, 1, 8, v18); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 3); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v13 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2)); + ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2)); + ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3)); + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8)); + } + } +} + +// Show results in prefix +TEST_F(PrefixTest, PrefixValid) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets << std::endl; + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // Insert keys with common prefix and one key with different + Slice v16("v16"); + Slice v17("v17"); + Slice v18("v18"); + Slice v19("v19"); + PutKey(db.get(), write_options, 12345, 6, v16); + PutKey(db.get(), write_options, 12345, 7, v17); + PutKey(db.get(), write_options, 12345, 8, v18); + PutKey(db.get(), write_options, 12345, 9, v19); + PutKey(db.get(), write_options, 12346, 8, v16); + ASSERT_OK(db->Flush(FlushOptions())); + TestKey test_key(12346, 8); + std::string s; + ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key))); + ASSERT_OK(db->Flush(FlushOptions())); + read_options.prefix_same_as_start = true; + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 12345, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v19 == iter->value()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8)); + + // Verify seeking past the prefix won't return a result. 
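+      // prefix_same_as_start pins the iterator to prefix 12345; suffix 10 is
+      // past the last stored suffix (9), so the seek must land invalid rather
+      // than rolling over into prefix 12346.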
+ SeekIterator(iter.get(), 12345, 10); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + } + } +} + +TEST_F(PrefixTest, DynamicPrefixIterator) { + while (NextOptions(FLAGS_bucket_count)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + RandomShuffle(prefixes.begin(), prefixes.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + std::string s; + Slice key = TestKeyToSlice(s, test_key); + std::string value(FLAGS_value_size, 0); + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(get_perf_context()->user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" + << hist_put_comparison.ToString() << "Put time: \n" + << hist_put_time.ToString(); + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + std::unique_ptr iter(db->NewIterator(read_options)); + + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + std::string value = "v" + std::to_string(0); + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get(), true); + auto key_prefix = options.prefix_extractor->Transform(key); + uint64_t total_keys = 0; + for (iter->Seek(key); + iter->Valid() && iter->key().starts_with(key_prefix); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count); + ASSERT_EQ(total_keys, + FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; prefix++) { + TestKey test_key(prefix, 0); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + + get_perf_context()->Reset(); + StopWatchNano timer(SystemClock::Default().get(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add( + get_perf_context()->user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); + } +} + +TEST_F(PrefixTest, PrefixSeekModePrev) { + // Only for SkipListFactory + options.memtable_factory.reset(new SkipListFactory); + options.merge_operator = MergeOperators::CreatePutOperator(); + options.write_buffer_size = 1024 * 1024; + Random rnd(1); + for (size_t m = 1; m < 100; m++) { + std::cout 
<< "[" + std::to_string(m) + "]" + "*** Mem table: " + << options.memtable_factory->Name() << std::endl; + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + std::map entry_maps[3], whole_map; + for (uint64_t i = 0; i < 10; i++) { + int div = i % 3 + 1; + for (uint64_t j = 0; j < 10; j++) { + whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] = + 'v' + std::to_string(i) + std::to_string(j); + } + } + + std::map type_map; + for (size_t i = 0; i < 3; i++) { + for (auto& kv : entry_maps[i]) { + if (rnd.OneIn(3)) { + PutKey(db.get(), write_options, kv.first, kv.second); + type_map[kv.first] = "value"; + } else { + MergeKey(db.get(), write_options, kv.first, kv.second); + type_map[kv.first] = "merge"; + } + } + if (i < 2) { + ASSERT_OK(db->Flush(FlushOptions())); + } + } + + for (size_t i = 0; i < 2; i++) { + for (auto& kv : entry_maps[i]) { + if (rnd.OneIn(10)) { + whole_map.erase(kv.first); + DeleteKey(db.get(), write_options, kv.first); + entry_maps[2][kv.first] = "delete"; + } + } + } + + if (FLAGS_enable_print) { + for (size_t i = 0; i < 3; i++) { + for (auto& kv : entry_maps[i]) { + std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted + << " " << kv.second + " " + type_map[kv.first] << std::endl; + } + } + } + + std::unique_ptr iter(db->NewIterator(read_options)); + for (uint64_t prefix = 0; prefix < 10; prefix++) { + uint64_t start_suffix = rnd.Uniform(9); + SeekIterator(iter.get(), prefix, start_suffix); + auto it = whole_map.find(TestKey(prefix, start_suffix)); + if (it == whole_map.end()) { + continue; + } + ASSERT_NE(it, whole_map.end()); + ASSERT_TRUE(iter->Valid()); + if (FLAGS_enable_print) { + std::cout << "round " << prefix + << " iter: " << SliceToTestKey(iter->key()).prefix + << SliceToTestKey(iter->key()).sorted + << " | map: " << it->first.prefix << it->first.sorted << " | " + << iter->value().ToString() << " " << it->second << std::endl; + } + ASSERT_EQ(iter->value(), it->second); + uint64_t stored_prefix = prefix; + for (size_t k = 0; k < 9; k++) { + if (rnd.OneIn(2) || it == whole_map.begin()) { + iter->Next(); + ++it; + if (FLAGS_enable_print) { + std::cout << "Next >> "; + } + } else { + iter->Prev(); + it--; + if (FLAGS_enable_print) { + std::cout << "Prev >> "; + } + } + if (!iter->Valid() || + SliceToTestKey(iter->key()).prefix != stored_prefix) { + break; + } + ASSERT_OK(iter->status()); + stored_prefix = SliceToTestKey(iter->key()).prefix; + ASSERT_TRUE(iter->Valid()); + ASSERT_NE(it, whole_map.end()); + ASSERT_EQ(iter->value(), it->second); + if (FLAGS_enable_print) { + std::cout << "iter: " << SliceToTestKey(iter->key()).prefix + << SliceToTestKey(iter->key()).sorted + << " | map: " << it->first.prefix << it->first.sorted + << " | " << iter->value().ToString() << " " << it->second + << std::endl; + } + } + } + } +} + +TEST_F(PrefixTest, PrefixSeekModePrev2) { + // Only for SkipListFactory + // test the case + // iter1 iter2 + // | prefix | suffix | | prefix | suffix | + // | 1 | 1 | | 1 | 2 | + // | 1 | 3 | | 1 | 4 | + // | 2 | 1 | | 3 | 3 | + // | 2 | 2 | | 3 | 4 | + // after seek(15), iter1 will be at 21 and iter2 will be 33. + // Then if call Prev() in prefix mode where SeekForPrev(21) gets called, + // iter2 should turn to invalid state because of bloom filter. 
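+  // The two flushes below create exactly that layout: the first SST holds
+  // iter2's keys (1,2), (1,4), (3,3), (3,4) and the second holds iter1's
+  // keys (1,1), (1,3), (2,1), (2,2).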
+ options.memtable_factory.reset(new SkipListFactory); + options.write_buffer_size = 1024 * 1024; + std::string v13("v13"); + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + PutKey(db.get(), write_options, TestKey(1, 2), "v12"); + PutKey(db.get(), write_options, TestKey(1, 4), "v14"); + PutKey(db.get(), write_options, TestKey(3, 3), "v33"); + PutKey(db.get(), write_options, TestKey(3, 4), "v34"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + PutKey(db.get(), write_options, TestKey(1, 1), "v11"); + PutKey(db.get(), write_options, TestKey(1, 3), "v13"); + PutKey(db.get(), write_options, TestKey(2, 1), "v21"); + PutKey(db.get(), write_options, TestKey(2, 2), "v22"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 5); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), v13); +} + +TEST_F(PrefixTest, PrefixSeekModePrev3) { + // Only for SkipListFactory + // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode + options.memtable_factory.reset(new SkipListFactory); + options.write_buffer_size = 1024 * 1024; + std::string v14("v14"); + TestKey upper_bound_key = TestKey(1, 5); + std::string s; + Slice upper_bound = TestKeyToSlice(s, upper_bound_key); + + { + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + read_options.iterate_upper_bound = &upper_bound; + PutKey(db.get(), write_options, TestKey(1, 2), "v12"); + PutKey(db.get(), write_options, TestKey(1, 4), "v14"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + PutKey(db.get(), write_options, TestKey(1, 1), "v11"); + PutKey(db.get(), write_options, TestKey(1, 3), "v13"); + PutKey(db.get(), write_options, TestKey(2, 1), "v21"); + PutKey(db.get(), write_options, TestKey(2, 2), "v22"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + std::unique_ptr iter(db->NewIterator(read_options)); + iter->SeekToLast(); + ASSERT_EQ(iter->value(), v14); + } + { + ASSERT_OK(DestroyDB(kDbName, Options())); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + read_options.iterate_upper_bound = &upper_bound; + PutKey(db.get(), write_options, TestKey(1, 2), "v12"); + PutKey(db.get(), write_options, TestKey(1, 4), "v14"); + PutKey(db.get(), write_options, TestKey(3, 3), "v33"); + PutKey(db.get(), write_options, TestKey(3, 4), "v34"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + PutKey(db.get(), write_options, TestKey(1, 1), "v11"); + PutKey(db.get(), write_options, TestKey(1, 3), "v13"); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); + std::unique_ptr iter(db->NewIterator(read_options)); + iter->SeekToLast(); + ASSERT_EQ(iter->value(), v14); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS + +#else +#include + +int main(int 
/*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as HashSkipList and HashLinkList are not supported in " + "ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc new file mode 100644 index 000000000..c03efa11f --- /dev/null +++ b/src/rocksdb/db/range_del_aggregator.cc @@ -0,0 +1,524 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/range_del_aggregator.h" + +#include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "rocksdb/comparator.h" +#include "rocksdb/types.h" +#include "table/internal_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/table_builder.h" +#include "util/heap.h" +#include "util/kv_map.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +TruncatedRangeDelIterator::TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest) + : iter_(std::move(iter)), + icmp_(icmp), + smallest_ikey_(smallest), + largest_ikey_(largest) { + if (smallest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_smallest = pinned_bounds_.back(); + Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + smallest_ = &parsed_smallest; + } + if (largest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_largest = pinned_bounds_.back(); + + Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + + if (parsed_largest.type == kTypeRangeDeletion && + parsed_largest.sequence == kMaxSequenceNumber) { + // The file boundary has been artificially extended by a range tombstone. + // We do not need to adjust largest to properly truncate range + // tombstones that extend past the boundary. + } else if (parsed_largest.sequence == 0) { + // The largest key in the sstable has a sequence number of 0. Since we + // guarantee that no internal keys with the same user key and sequence + // number can exist in a DB, we know that the largest key in this sstable + // cannot exist as the smallest key in the next sstable. This further + // implies that no range tombstone in this sstable covers largest; + // otherwise, the file boundary would have been artificially extended. + // + // Therefore, we will never truncate a range tombstone at largest, so we + // can leave it unchanged. + } else { + // The same user key may straddle two sstable boundaries. To ensure that + // the truncated end key can cover the largest key in this sstable, reduce + // its sequence number by 1. + parsed_largest.sequence -= 1; + // This line is not needed for correctness, but it ensures that the + // truncated end key is not covering keys from the next SST file. 
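+      // (Internal keys sort by user key ascending, then sequence number
+      // descending, then type descending; kValueTypeForSeek is the largest
+      // type, so this bound sorts ahead of any real entry with the same
+      // user key and sequence number.)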
+ parsed_largest.type = kValueTypeForSeek; + } + largest_ = &parsed_largest; + } +} + +bool TruncatedRangeDelIterator::Valid() const { + assert(iter_ != nullptr); + return iter_->Valid() && + (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) && + (largest_ == nullptr || + icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0); +} + +// NOTE: target is a user key, with timestamp if enabled. +void TruncatedRangeDelIterator::Seek(const Slice& target) { + if (largest_ != nullptr && + icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber, + kTypeRangeDeletion)) <= 0) { + iter_->Invalidate(); + return; + } + if (smallest_ != nullptr && + icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { + iter_->Seek(smallest_->user_key); + return; + } + iter_->Seek(target); +} + +// NOTE: target is a user key, with timestamp if enabled. +void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { + if (smallest_ != nullptr && + icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion), + *smallest_) < 0) { + iter_->Invalidate(); + return; + } + if (largest_ != nullptr && + icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekForPrev(target); +} + +void TruncatedRangeDelIterator::SeekToFirst() { + if (smallest_ != nullptr) { + iter_->Seek(smallest_->user_key); + return; + } + iter_->SeekToTopFirst(); +} + +void TruncatedRangeDelIterator::SeekToLast() { + if (largest_ != nullptr) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekToTopLast(); +} + +std::map> +TruncatedRangeDelIterator::SplitBySnapshot( + const std::vector& snapshots) { + using FragmentedIterPair = + std::pair>; + + auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); + std::map> + split_truncated_iters; + std::for_each( + split_untruncated_iters.begin(), split_untruncated_iters.end(), + [&](FragmentedIterPair& iter_pair) { + auto truncated_iter = std::make_unique( + std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_); + split_truncated_iters.emplace(iter_pair.first, + std::move(truncated_iter)); + }); + return split_truncated_iters; +} + +ForwardRangeDelIterator::ForwardRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(EndKeyMinComparator(icmp)), + inactive_iters_(StartKeyMinComparator(icmp)) {} + +bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that end before parsed. + while (!active_iters_.empty() && + icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Next(); + } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that start before parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) { + iter->Next(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + return active_seqnums_.empty() + ? 
false + : (*active_seqnums_.begin())->seq() > parsed.sequence; +} + +void ForwardRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); +} + +ReverseRangeDelIterator::ReverseRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(StartKeyMaxComparator(icmp)), + inactive_iters_(EndKeyMaxComparator(icmp)) {} + +bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that start after parsed. + while (!active_iters_.empty() && + icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Prev(); + } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that end after parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) { + iter->Prev(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; +} + +void ReverseRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); +} + +bool RangeDelAggregator::StripeRep::ShouldDelete( + const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + if (!InStripe(parsed.sequence) || IsEmpty()) { + return false; + } + switch (mode) { + case RangeDelPositioningMode::kForwardTraversal: + InvalidateReverseIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); + it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { + auto& iter = *it; + forward_iter_.AddNewIter(iter.get(), parsed); + } + + return forward_iter_.ShouldDelete(parsed); + case RangeDelPositioningMode::kBackwardTraversal: + InvalidateForwardIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx()); + it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) { + auto& iter = *it; + reverse_iter_.AddNewIter(iter.get(), parsed); + } + + return reverse_iter_.ShouldDelete(parsed); + default: + assert(false); + return false; + } +} + +bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start, + const Slice& end) { + Invalidate(); + + // Set the internal start/end keys so that: + // - if start_ikey has the same user key and sequence number as the + // current end key, start_ikey will be considered greater; and + // - if end_ikey has the same user key and sequence number as the current + // start key, end_ikey will be considered greater. 
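+  // Ties on user key and sequence number are broken by the value type field:
+  // internal keys sort by user key ascending, then sequence number
+  // descending, then type descending, and type 0 is the smallest type, so it
+  // sorts after (greater than) a range tombstone key with the same user key
+  // and sequence number.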
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber, + static_cast(0)); + ParsedInternalKey end_ikey(end, 0, static_cast(0)); + for (auto& iter : iters_) { + bool checked_candidate_tombstones = false; + for (iter->SeekForPrev(start); + iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0; + iter->Next()) { + checked_candidate_tombstones = true; + if (icmp_->Compare(start_ikey, iter->end_key()) < 0 && + icmp_->Compare(iter->start_key(), end_ikey) <= 0) { + return true; + } + } + + if (!checked_candidate_tombstones) { + // Do an additional check for when the end of the range is the begin + // key of a tombstone, which we missed earlier since SeekForPrev'ing + // to the start was invalid. + iter->SeekForPrev(end); + if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 && + icmp_->Compare(iter->start_key(), end_ikey) <= 0) { + return true; + } + } + } + return false; +} + +void ReadRangeDelAggregator::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { + return; + } + rep_.AddTombstones(std::make_unique( + std::move(input_iter), icmp_, smallest, largest)); +} + +bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { + return rep_.ShouldDelete(parsed, mode); +} + +bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start, + const Slice& end) { + InvalidateRangeDelMapPositions(); + return rep_.IsRangeOverlapped(start, end); +} + +void CompactionRangeDelAggregator::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { + return; + } + // This bounds output of CompactionRangeDelAggregator::NewIterator. + if (!trim_ts_.empty()) { + assert(icmp_->user_comparator()->timestamp_size() > 0); + input_iter->SetTimestampUpperBound(&trim_ts_); + } + + assert(input_iter->lower_bound() == 0); + assert(input_iter->upper_bound() == kMaxSequenceNumber); + parent_iters_.emplace_back(new TruncatedRangeDelIterator( + std::move(input_iter), icmp_, smallest, largest)); + + Slice* ts_upper_bound = nullptr; + if (!ts_upper_bound_.empty()) { + assert(icmp_->user_comparator()->timestamp_size() > 0); + ts_upper_bound = &ts_upper_bound_; + } + auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_); + for (auto& split_iter : split_iters) { + auto it = reps_.find(split_iter.first); + if (it == reps_.end()) { + bool inserted; + SequenceNumber upper_bound = split_iter.second->upper_bound(); + SequenceNumber lower_bound = split_iter.second->lower_bound(); + std::tie(it, inserted) = reps_.emplace( + split_iter.first, StripeRep(icmp_, upper_bound, lower_bound)); + assert(inserted); + } + assert(it != reps_.end()); + // ts_upper_bound is used to bound ShouldDelete() to only consider + // range tombstones under full_history_ts_low_ and trim_ts_. Keys covered by + // range tombstones that are above full_history_ts_low_ should not be + // dropped prematurely: user may read with a timestamp between the range + // tombstone and the covered key. Note that we cannot set timestamp + // upperbound on the original `input_iter` since `input_iter`s are later + // used in CompactionRangeDelAggregator::NewIterator to output range + // tombstones for persistence. We do not want to only persist range + // tombstones with timestamp lower than ts_upper_bound. 
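+    // Each split iterator is handed to the StripeRep for its snapshot
+    // stripe; ShouldDelete() later consults only the stripe that covers the
+    // key's sequence number.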
+ split_iter.second->SetTimestampUpperBound(ts_upper_bound); + it->second.AddTombstones(std::move(split_iter.second)); + } +} + +bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { + auto it = reps_.lower_bound(parsed.sequence); + if (it == reps_.end()) { + return false; + } + return it->second.ShouldDelete(parsed, mode); +} + +namespace { + +// Produce a sorted (by start internal key) stream of range tombstones from +// `children`. lower_bound and upper_bound on user key can be +// optionally specified. Range tombstones that ends before lower_bound or starts +// after upper_bound are excluded. +// If user-defined timestamp is enabled, lower_bound and upper_bound should +// contain timestamp, but comparison is done ignoring timestamps. +class TruncatedRangeDelMergingIter : public InternalIterator { + public: + TruncatedRangeDelMergingIter( + const InternalKeyComparator* icmp, const Slice* lower_bound, + const Slice* upper_bound, bool upper_bound_inclusive, + const std::vector>& children) + : icmp_(icmp), + lower_bound_(lower_bound), + upper_bound_(upper_bound), + upper_bound_inclusive_(upper_bound_inclusive), + heap_(StartKeyMinComparator(icmp)), + ts_sz_(icmp_->user_comparator()->timestamp_size()) { + for (auto& child : children) { + if (child != nullptr) { + assert(child->lower_bound() == 0); + assert(child->upper_bound() == kMaxSequenceNumber); + children_.push_back(child.get()); + } + } + } + + bool Valid() const override { + return !heap_.empty() && BeforeEndKey(heap_.top()); + } + Status status() const override { return Status::OK(); } + + void SeekToFirst() override { + heap_.clear(); + for (auto& child : children_) { + if (lower_bound_ != nullptr) { + child->Seek(*lower_bound_); + } else { + child->SeekToFirst(); + } + if (child->Valid()) { + heap_.push(child); + } + } + } + + void Next() override { + auto* top = heap_.top(); + top->InternalNext(); + if (top->Valid()) { + heap_.replace_top(top); + } else { + heap_.pop(); + } + } + + Slice key() const override { + auto* top = heap_.top(); + if (ts_sz_) { + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion, top->timestamp()); + } else { + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion); + } + assert(top->start_key().user_key.size() >= ts_sz_); + return cur_start_key_.Encode(); + } + + Slice value() const override { + auto* top = heap_.top(); + if (!ts_sz_) { + return top->end_key().user_key; + } + assert(top->timestamp().size() == ts_sz_); + cur_end_key_.clear(); + cur_end_key_.append(top->end_key().user_key.data(), + top->end_key().user_key.size() - ts_sz_); + cur_end_key_.append(top->timestamp().data(), ts_sz_); + return cur_end_key_; + } + + // Unused InternalIterator methods + void Prev() override { assert(false); } + void Seek(const Slice& /* target */) override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + private: + bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const { + if (upper_bound_ == nullptr) { + return true; + } + int cmp = icmp_->user_comparator()->CompareWithoutTimestamp( + iter->start_key().user_key, *upper_bound_); + return upper_bound_inclusive_ ? 
cmp <= 0 : cmp < 0; + } + + const InternalKeyComparator* icmp_; + const Slice* lower_bound_; + const Slice* upper_bound_; + bool upper_bound_inclusive_; + BinaryHeap heap_; + std::vector children_; + + mutable InternalKey cur_start_key_; + mutable std::string cur_end_key_; + size_t ts_sz_; +}; + +} // anonymous namespace + +std::unique_ptr +CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound, + const Slice* upper_bound, + bool upper_bound_inclusive) { + InvalidateRangeDelMapPositions(); + auto merging_iter = std::make_unique( + icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_); + + auto fragmented_tombstone_list = + std::make_shared( + std::move(merging_iter), *icmp_, true /* for_compaction */, + *snapshots_); + + return std::make_unique( + fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h new file mode 100644 index 000000000..9bd40967d --- /dev/null +++ b/src/rocksdb/db/range_del_aggregator.h @@ -0,0 +1,476 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "rocksdb/comparator.h" +#include "rocksdb/types.h" +#include "table/internal_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/table_builder.h" +#include "util/heap.h" +#include "util/kv_map.h" + +namespace ROCKSDB_NAMESPACE { + +class TruncatedRangeDelIterator { + public: + TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest); + + bool Valid() const; + + void Next() { iter_->TopNext(); } + void Prev() { iter_->TopPrev(); } + + void InternalNext() { iter_->Next(); } + + // Seeks to the tombstone with the highest visible sequence number that covers + // target (a user key). If no such tombstone exists, the position will be at + // the earliest tombstone that ends after target. + // REQUIRES: target is a user key. + void Seek(const Slice& target); + + // Seeks to the tombstone with the highest visible sequence number that covers + // target (a user key). If no such tombstone exists, the position will be at + // the latest tombstone that starts before target. + void SeekForPrev(const Slice& target); + + void SeekToFirst(); + void SeekToLast(); + + ParsedInternalKey start_key() const { + return (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0) + ? iter_->parsed_start_key() + : *smallest_; + } + + ParsedInternalKey end_key() const { + return (largest_ == nullptr || + icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0) + ? 
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..9bd40967d
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,476 @@
+// Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+  TruncatedRangeDelIterator(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+      const InternalKeyComparator* icmp, const InternalKey* smallest,
+      const InternalKey* largest);
+
+  bool Valid() const;
+
+  void Next() { iter_->TopNext(); }
+  void Prev() { iter_->TopPrev(); }
+
+  void InternalNext() { iter_->Next(); }
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the earliest tombstone that ends after target.
+  // REQUIRES: target is a user key.
+  void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the latest tombstone that starts before target.
+  void SeekForPrev(const Slice& target);
+
+  void SeekToFirst();
+  void SeekToLast();
+
+  ParsedInternalKey start_key() const {
+    return (smallest_ == nullptr ||
+            icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+               ? iter_->parsed_start_key()
+               : *smallest_;
+  }
+
+  ParsedInternalKey end_key() const {
+    return (largest_ == nullptr ||
+            icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+               ? iter_->parsed_end_key()
+               : *largest_;
+  }
+
+  SequenceNumber seq() const { return iter_->seq(); }
+  Slice timestamp() const {
+    assert(icmp_->user_comparator()->timestamp_size());
+    return iter_->timestamp();
+  }
+  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+    iter_->SetTimestampUpperBound(ts_upper_bound);
+  }
+
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+  SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+  std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+  const InternalKeyComparator* icmp_;
+  const ParsedInternalKey* smallest_ = nullptr;
+  const ParsedInternalKey* largest_ = nullptr;
+  std::list<ParsedInternalKey> pinned_bounds_;
+
+  const InternalKey* smallest_ikey_;
+  const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return a->seq() > b->seq();
+  }
+};
+
+struct StartKeyMinComparator {
+  explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return icmp->Compare(a->start_key(), b->start_key()) > 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
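Note that StartKeyMinComparator answers "greater than" so that a max-heap
surfaces the smallest start key at top(), which is how the merging iterator
earlier in this patch stays sorted. The same trick with the standard
library's max-heap, as a standalone sketch (names illustrative):

#include <cassert>
#include <queue>
#include <string>
#include <vector>

int main() {
  // std::priority_queue is a max-heap; inverting the comparator ("a > b")
  // makes the *smallest* element surface at top(), exactly as BinaryHeap
  // does when parameterized with StartKeyMinComparator.
  auto min_first = [](const std::string& a, const std::string& b) {
    return a > b;
  };
  std::priority_queue<std::string, std::vector<std::string>,
                      decltype(min_first)>
      heap(min_first);
  heap.push("j");
  heap.push("a");
  heap.push("e");
  assert(heap.top() == "a");  // smallest start key wins
  return 0;
}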
+
+class ForwardRangeDelIterator {
+ public:
+  explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->Seek(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMinComparator {
+    explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+      return;
+    }
+    int cmp = icmp_->Compare(parsed, iter->start_key());
+    if (cmp < 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
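The active/inactive split above is a forward interval sweep: tombstones whose
range has started move to the active set, tombstones starting later wait in
the inactive heap, and expired ones are evicted by end key. A freestanding
sketch of the same sweep over plain string intervals (all names illustrative;
query keys must be nondecreasing, as with kForwardTraversal):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <utility>

int main() {
  // Not-yet-started intervals, keyed by start; value = (end, seqno).
  std::multimap<std::string, std::pair<std::string, uint64_t>> inactive{
      {"a", {"e", 10}}, {"c", {"g", 8}}, {"j", {"n", 4}}};
  std::multimap<std::string, uint64_t> active;  // end key -> seqno

  auto max_covering_seq = [&](const std::string& key) {
    // Activate every interval whose start is <= key.
    while (!inactive.empty() && inactive.begin()->first <= key) {
      active.emplace(inactive.begin()->second.first,
                     inactive.begin()->second.second);
      inactive.erase(inactive.begin());
    }
    // Evict intervals whose (exclusive) end is at or before key.
    while (!active.empty() && active.begin()->first <= key) {
      active.erase(active.begin());
    }
    uint64_t best = 0;
    for (const auto& e : active) best = std::max(best, e.second);
    return best;
  };

  assert(max_covering_seq("b") == 10);  // covered by ["a","e")@10
  assert(max_covering_seq("f") == 8);   // ["a","e") expired; ["c","g")@8 covers
  assert(max_covering_seq("h") == 0);   // gap between "g" and "j"
  assert(max_covering_seq("k") == 4);
  return 0;
}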
+
+class ReverseRangeDelIterator {
+ public:
+  explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->SeekForPrev(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMaxComparator {
+    explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const TruncatedRangeDelIterator* a,
+                    const TruncatedRangeDelIterator* b) const {
+      return icmp->Compare(a->end_key(), b->end_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+  struct StartKeyMaxComparator {
+    explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+    } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
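The two enum values encode a calling contract rather than a hint: with
kForwardTraversal the caller presents keys in nondecreasing order, so the
aggregator can advance its internal iterators instead of re-seeking per key.
A sketch of that calling pattern against the read aggregator declared below
(assumes compilation in the RocksDB tree; CountShadowedKeys is an invented
helper, not part of this patch):

#include <vector>

#include "db/range_del_aggregator.h"

namespace ROCKSDB_NAMESPACE {
size_t CountShadowedKeys(ReadRangeDelAggregator* agg,
                         const std::vector<ParsedInternalKey>& sorted_keys) {
  size_t shadowed = 0;
  for (const auto& k : sorted_keys) {
    // Keys arrive in ascending order, satisfying the forward-mode contract.
    if (agg->ShouldDelete(k, RangeDelPositioningMode::kForwardTraversal)) {
      ++shadowed;
    }
  }
  return shadowed;
}
}  // namespace ROCKSDB_NAMESPACE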
+class RangeDelAggregator {
+ public:
+  explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+      : icmp_(icmp) {}
+  virtual ~RangeDelAggregator() {}
+
+  virtual void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) = 0;
+
+  bool ShouldDelete(const Slice& ikey, RangeDelPositioningMode mode) {
+    ParsedInternalKey parsed;
+
+    Status pik_status =
+        ParseInternalKey(ikey, &parsed, false /* log_err_key */);  // TODO
+    assert(pik_status.ok());
+    if (!pik_status.ok()) {
+      return false;
+    }
+
+    return ShouldDelete(parsed, mode);
+  }
+  virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+                            RangeDelPositioningMode mode) = 0;
+
+  virtual void InvalidateRangeDelMapPositions() = 0;
+
+  virtual bool IsEmpty() const = 0;
+
+  bool AddFile(uint64_t file_number) {
+    return files_seen_.insert(file_number).second;
+  }
+
+ protected:
+  class StripeRep {
+   public:
+    StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+              SequenceNumber lower_bound)
+        : icmp_(icmp),
+          forward_iter_(icmp),
+          reverse_iter_(icmp),
+          upper_bound_(upper_bound),
+          lower_bound_(lower_bound) {}
+
+    void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+      iters_.push_back(std::move(input_iter));
+    }
+
+    bool IsEmpty() const { return iters_.empty(); }
+
+    bool ShouldDelete(const ParsedInternalKey& parsed,
+                      RangeDelPositioningMode mode);
+
+    void Invalidate() {
+      if (!IsEmpty()) {
+        InvalidateForwardIter();
+        InvalidateReverseIter();
+      }
+    }
+
+    // If user-defined timestamp is enabled, `start` and `end` are user keys
+    // with timestamp.
+    bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+   private:
+    bool InStripe(SequenceNumber seq) const {
+      return lower_bound_ <= seq && seq <= upper_bound_;
+    }
+
+    void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+    void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+    const InternalKeyComparator* icmp_;
+    std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+    ForwardRangeDelIterator forward_iter_;
+    ReverseRangeDelIterator reverse_iter_;
+    SequenceNumber upper_bound_;
+    SequenceNumber lower_bound_;
+  };
+
+  const InternalKeyComparator* icmp_;
+
+ private:
+  std::set<uint64_t> files_seen_;
+};
+
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+  ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+                         SequenceNumber upper_bound)
+      : RangeDelAggregator(icmp),
+        rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+  ~ReadRangeDelAggregator() override {}
+
+  using RangeDelAggregator::ShouldDelete;
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) final override {
+    if (rep_.IsEmpty()) {
+      return false;
+    }
+    return ShouldDeleteImpl(parsed, mode);
+  }
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+  bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+  StripeRep rep_;
+
+  bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                        RangeDelPositioningMode mode);
+};
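The CompactionRangeDelAggregator constructor that follows keeps the smaller
of full_history_ts_low and trim_ts as the effective timestamp upper bound.
The selection rule in isolation, sketched with plain integers standing in for
encoded timestamps (real code compares through the user comparator):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>

// Effective bound = min(full_history_ts_low, trim_ts), where an unset value
// means "no constraint" -- mirroring the constructor logic below.
std::optional<uint64_t> EffectiveUpperBound(
    std::optional<uint64_t> full_history_ts_low,
    std::optional<uint64_t> trim_ts) {
  if (!full_history_ts_low.has_value()) return trim_ts;
  if (!trim_ts.has_value()) return full_history_ts_low;
  return std::min(*full_history_ts_low, *trim_ts);
}

int main() {
  assert(EffectiveUpperBound(std::nullopt, 7) == 7);
  assert(EffectiveUpperBound(5, std::nullopt) == 5);
  assert(EffectiveUpperBound(9, 7) == 7);
  return 0;
}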
+
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+  CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+                               const std::vector<SequenceNumber>& snapshots,
+                               const std::string* full_history_ts_low = nullptr,
+                               const std::string* trim_ts = nullptr)
+      : RangeDelAggregator(icmp), snapshots_(&snapshots) {
+    if (full_history_ts_low) {
+      ts_upper_bound_ = *full_history_ts_low;
+    }
+    if (trim_ts) {
+      trim_ts_ = *trim_ts;
+      // Range tombstones newer than `trim_ts` or `full_history_ts_low` should
+      // not be considered in ShouldDelete().
+      if (ts_upper_bound_.empty()) {
+        ts_upper_bound_ = trim_ts_;
+      } else if (!trim_ts_.empty() && icmp->user_comparator()->CompareTimestamp(
+                                          trim_ts_, ts_upper_bound_) < 0) {
+        ts_upper_bound_ = trim_ts_;
+      }
+    }
+  }
+  ~CompactionRangeDelAggregator() override {}
+
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  using RangeDelAggregator::ShouldDelete;
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) override;
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override {
+    for (auto& rep : reps_) {
+      rep.second.Invalidate();
+    }
+  }
+
+  bool IsEmpty() const override {
+    for (const auto& rep : reps_) {
+      if (!rep.second.IsEmpty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Creates an iterator over all the range tombstones in the aggregator, for
+  // use in compaction. Nullptr arguments indicate that the iterator range is
+  // unbounded.
+  // NOTE: the boundaries are used for optimization purposes to reduce the
+  // number of tombstones that are passed to the fragmenter; they do not
+  // guarantee that the resulting iterator only contains range tombstones that
+  // cover keys in the provided range. If required, these bounds must be
+  // enforced during iteration.
+  std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+      const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+      bool upper_bound_inclusive = false);
+
+ private:
+  std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+  std::map<SequenceNumber, StripeRep> reps_;
+
+  const std::vector<SequenceNumber>* snapshots_;
+  // min over full_history_ts_low and trim_ts_
+  Slice ts_upper_bound_{};
+  Slice trim_ts_{};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..9dca707e5
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,280 @@
+// Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/vector_iterator.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+             "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+             "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+              "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+             "number of AddTombstones calls per run");
+
+DEFINE_bool(use_compaction_range_del_aggregator, false,
+            "Whether to use CompactionRangeDelAggregator. 
Default is to use " + "ReadRangeDelAggregator."); + +namespace { + +struct Stats { + uint64_t time_add_tombstones = 0; + uint64_t time_first_should_delete = 0; + uint64_t time_rest_should_delete = 0; + uint64_t time_fragment_tombstones = 0; +}; + +std::ostream& operator<<(std::ostream& os, const Stats& s) { + std::ios fmt_holder(nullptr); + fmt_holder.copyfmt(os); + + os << std::left; + os << std::setw(25) << "Fragment Tombstones: " + << s.time_fragment_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; + os << std::setw(25) << "AddTombstones: " + << s.time_add_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; + os << std::setw(25) << "ShouldDelete (first): " + << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n"; + if (FLAGS_should_deletes_per_run > 1) { + os << std::setw(25) << "ShouldDelete (rest): " + << s.time_rest_should_delete / + ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3) + << " us\n"; + } + + os.copyfmt(fmt_holder); + return os; +} + +auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator( + ROCKSDB_NAMESPACE::BytewiseComparator()); + +} // anonymous namespace + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// A wrapper around RangeTombstones and the underlying data of its start and end +// keys. +struct PersistentRangeTombstone { + std::string start_key; + std::string end_key; + RangeTombstone tombstone; + + PersistentRangeTombstone(std::string start, std::string end, + SequenceNumber seq) + : start_key(std::move(start)), end_key(std::move(end)) { + tombstone = RangeTombstone(start_key, end_key, seq); + } + + PersistentRangeTombstone() = default; + + PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; } + + PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) { + start_key = t.start_key; + end_key = t.end_key; + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } + + PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; } + + PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) { + start_key = std::move(t.start_key); + end_key = std::move(t.end_key); + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } +}; + +struct TombstoneStartKeyComparator { + explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + return cmp->Compare(a.start_key_, b.start_key_) < 0; + } + + const Comparator* cmp; +}; + +std::unique_ptr MakeRangeDelIterator( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.tombstone.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new VectorIterator(keys, values, &icmp)); +} + +// convert long to a big-endian slice key +static std::string Key(int64_t val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (size_t i = 0; i < sizeof(val); ++i) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +} // anonymous namespace + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + Stats stats; 
+ ROCKSDB_NAMESPACE::SystemClock* clock = + ROCKSDB_NAMESPACE::SystemClock::Default().get(); + ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed); + std::default_random_engine random_gen(FLAGS_seed); + std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, + FLAGS_tombstone_width_stddev); + std::vector > + all_persistent_range_tombstones(FLAGS_add_tombstones_per_run); + for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) { + all_persistent_range_tombstones[i] = + std::vector( + FLAGS_num_range_tombstones); + } + auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal; + std::vector snapshots{0}; + for (int i = 0; i < FLAGS_num_runs; i++) { + std::unique_ptr range_del_agg = + nullptr; + if (FLAGS_use_compaction_range_del_aggregator) { + range_del_agg.reset(new ROCKSDB_NAMESPACE::CompactionRangeDelAggregator( + &icmp, snapshots)); + } else { + range_del_agg.reset(new ROCKSDB_NAMESPACE::ReadRangeDelAggregator( + &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */)); + } + + std::vector< + std::unique_ptr > + fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run); + + for (auto& persistent_range_tombstones : all_persistent_range_tombstones) { + // TODO(abhimadan): consider whether creating the range tombstones right + // before AddTombstones is artificially warming the cache compared to + // real workloads. + for (int j = 0; j < FLAGS_num_range_tombstones; j++) { + uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound); + uint64_t end = static_cast( + std::round(start + std::max(1.0, normal_dist(random_gen)))); + persistent_range_tombstones[j] = + ROCKSDB_NAMESPACE::PersistentRangeTombstone( + ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j); + } + auto iter = + ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones); + ROCKSDB_NAMESPACE::StopWatchNano stop_watch_fragment_tombstones( + clock, true /* auto_start */); + fragmented_range_tombstone_lists.emplace_back( + new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList( + std::move(iter), icmp, FLAGS_use_compaction_range_del_aggregator, + snapshots)); + stats.time_fragment_tombstones += + stop_watch_fragment_tombstones.ElapsedNanos(); + std::unique_ptr + fragmented_range_del_iter( + new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator( + fragmented_range_tombstone_lists.back().get(), icmp, + ROCKSDB_NAMESPACE::kMaxSequenceNumber)); + + ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( + clock, true /* auto_start */); + range_del_agg->AddTombstones(std::move(fragmented_range_del_iter)); + stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); + } + + ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key; + parsed_key.sequence = FLAGS_num_range_tombstones / 2; + parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue; + + uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound - + FLAGS_should_deletes_per_run + 1); + + for (int j = 0; j < FLAGS_should_deletes_per_run; j++) { + std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j); + parsed_key.user_key = key_string; + + ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( + clock, true /* auto_start */); + range_del_agg->ShouldDelete(parsed_key, mode); + uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); + + if (j == 0) { + stats.time_first_should_delete += call_time; + } else { + stats.time_rest_should_delete += call_time; + } + } + } + + std::cout << "=========================\n" + << "Results:\n" + << "=========================\n" + << stats; + + return 0; +} + 
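With gflags installed, the benchmark above is driven entirely by the flags it
defines; a typical invocation might look like this (the binary name depends
on the build setup and is illustrative):

./range_del_aggregator_bench --num_range_tombstones=10000 \
    --tombstone_width_mean=500 --should_deletes_per_run=100 \
    --use_compaction_range_del_aggregator

Each run prints the averaged latencies accumulated in Stats: fragmentation,
AddTombstones, and the first versus subsequent ShouldDelete calls.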
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..7fe35276a
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,715 @@
+// Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<InternalIterator>(
+      new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+    const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+  std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+  for (const auto& range_dels : range_dels_list) {
+    auto range_del_iter = MakeRangeDelIter(range_dels);
+    fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+        std::move(range_del_iter), bytewise_icmp));
+  }
+  return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+  Slice target;
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+  bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+  ParsedInternalKey lookup_key;
+  bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+  Slice start;
+  Slice end;
+  bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+  return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq,
+                                ValueType type = kTypeValue) {
+  return ParsedInternalKey(key, seq, type);
+}
+
+void VerifyIterator(
+    TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+    const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+  // Test forward iteration.
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+    EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+  }
+  EXPECT_FALSE(iter->Valid());
+
+  // Test reverse iteration.
+ iter->SeekToLast(); + std::vector reverse_expected_range_dels( + expected_range_dels.rbegin(), expected_range_dels.rend()); + for (size_t i = 0; i < reverse_expected_range_dels.size(); + i++, iter->Prev()) { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), + reverse_expected_range_dels[i].start)); + EXPECT_EQ( + 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end)); + EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq()); + } + EXPECT_FALSE(iter->Valid()); +} + +void VerifySeek(TruncatedRangeDelIterator* iter, + const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->Seek(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); + } + } +} + +void VerifySeekForPrev( + TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->SeekForPrev(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); + } + } +} + +void VerifyShouldDelete(RangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal)); + } + for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) { + const auto& test_case = *it; + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal)); + } +} + +void VerifyIsRangeOverlapped( + ReadRangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ(test_case.result, + range_del_agg->IsRangeOverlapped(test_case.start, test_case.end)); + } +} + +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); + + // Test FragmentedRangeTombstoneIterator interface. 
+ EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); +} + +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + +} // anonymous namespace + +TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) { + auto range_del_iter = MakeRangeDelIter({}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + iter.SeekToFirst(); + ASSERT_FALSE(iter.Valid()); + + iter.SeekToLast(); + ASSERT_FALSE(iter.Valid()); +} + +TEST_F(RangeDelAggregatorTest, UntruncatedIter) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + 9 /* snapshot */)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + 
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("d", 7, kTypeValue); + InternalKey largest("m", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator( + &iter, bytewise_icmp, + {{InternalValue("d", 7), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4, + false /* invalid */}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", InternalValue("d", 7), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4, + false /* invalid */}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("f", 7, kTypeValue); + InternalKey largest("i", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); +} + +TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + range_del_agg.AddTombstones(std::move(input_iter)); + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", false}}); +} + 
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("a", 9), true}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), false}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}}); + std::vector> iter_bounds = { + {InternalKey("a", 4, kTypeValue), + InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("m", 20, kTypeValue), + InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + for (size_t i = 0; i < fragment_lists.size(); i++) { + const auto& fragment_list = fragment_lists[i]; + const auto& bounds = iter_bounds[i]; + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter), &bounds.first, + &bounds.second); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, + {InternalValue("a", 9), false}, + {InternalValue("a", 4), true}, + {InternalValue("m", 10), false}, + {InternalValue("m", 9), true}, + {InternalValue("x", 10), false}, + {InternalValue("x", 9), false}, + {InternalValue("x", 5), true}, + {InternalValue("z", 9), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "n", true}, + {"l", "x", true}, + {"w", "z", true}, + {"zzz", 
"zz", false}, + {"zz", "zzz", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}}); + std::vector> iter_bounds = { + {InternalKey("a", 4, kTypeValue), + InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("m", 20, kTypeValue), + InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + + auto add_iter_to_agg = [&](size_t i) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_lists[i].get(), + bytewise_icmp, 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first, + &iter_bounds[i].second); + }; + + add_iter_to_agg(0); + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, + {InternalValue("a", 9), false}, + {InternalValue("a", 4), true}}); + + add_iter_to_agg(1); + VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false}, + {InternalValue("m", 9), true}}); + + add_iter_to_agg(2); + VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false}, + {InternalValue("x", 9), false}, + {InternalValue("x", 5), true}, + {InternalValue("z", 9), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "n", true}, + {"l", "x", true}, + {"w", "z", true}, + {"zzz", "zz", false}, + {"zz", "zzz", false}}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + auto range_del_compaction_iter = range_del_agg.NewIterator(); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, + {"b", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"h", "i", 25}, + {"ii", "j", 15}}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete( + &range_del_agg, + { + {InternalValue("a", 19), false}, // [10, 19] + {InternalValue("a", 9), false}, // [0, 9] + {InternalValue("b", 9), false}, // [0, 9] + {InternalValue("d", 9), false}, // [0, 9] + {InternalValue("d", 
7), true},    // [0, 9]
+          {InternalValue("e", 7), true},    // [0, 9]
+          {InternalValue("g", 7), false},   // [0, 9]
+          {InternalValue("h", 24), true},   // [20, kMaxSequenceNumber]
+          {InternalValue("i", 24), false},  // [20, kMaxSequenceNumber]
+          {InternalValue("ii", 14), true},  // [10, 19]
+          {InternalValue("j", 14), false}   // [10, 19]
+      });
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"a", "b", 10},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"c", "e", 8},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("_");
+  Slice end("__");
+
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("p");
+  Slice end("q");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("bb");
+  Slice end("e");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+                            {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(
+      range_del_compaction_iter2.get(),
+      {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+       CompactionAggregatorBoundedIteratorExtraFragments) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "d", 10}, {"c", "g", 8}},
+       {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+ 
CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc new file mode 100644 index 000000000..7e7cedeca --- /dev/null +++ b/src/rocksdb/db/range_tombstone_fragmenter.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/range_tombstone_fragmenter.h" + +#include +#include +#include +#include +#include + +#include "util/autovector.h" +#include "util/kv_map.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( + std::unique_ptr unfragmented_tombstones, + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots) { + if (unfragmented_tombstones == nullptr) { + return; + } + bool is_sorted = true; + InternalKey pinned_last_start_key; + Slice last_start_key; + num_unfragmented_tombstones_ = 0; + total_tombstone_payload_bytes_ = 0; + for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); + unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + if (num_unfragmented_tombstones_ > 0 && + icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) { + is_sorted = false; + break; + } + if (unfragmented_tombstones->IsKeyPinned()) { + last_start_key = unfragmented_tombstones->key(); + } else { + pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key()); + last_start_key = pinned_last_start_key.Encode(); + } + } + if (is_sorted) { + FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction, + snapshots); + return; + } + + // Sort the tombstones before fragmenting them. + std::vector keys, values; + keys.reserve(num_unfragmented_tombstones_); + values.reserve(num_unfragmented_tombstones_); + // Reset the counter to zero for the next iteration over keys. 
+ total_tombstone_payload_bytes_ = 0; + for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); + unfragmented_tombstones->Next()) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + keys.emplace_back(unfragmented_tombstones->key().data(), + unfragmented_tombstones->key().size()); + values.emplace_back(unfragmented_tombstones->value().data(), + unfragmented_tombstones->value().size()); + } + // VectorIterator implicitly sorts by key during construction. + auto iter = std::make_unique(std::move(keys), + std::move(values), &icmp); + FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots); +} + +void FragmentedRangeTombstoneList::FragmentTombstones( + std::unique_ptr unfragmented_tombstones, + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots) { + Slice cur_start_key(nullptr, 0); + auto cmp = ParsedInternalKeyComparator(&icmp); + + // Stores the end keys and sequence numbers of range tombstones with a start + // key less than or equal to cur_start_key. Provides an ordering by end key + // for use in flush_current_tombstones. + std::set cur_end_keys(cmp); + + size_t ts_sz = icmp.user_comparator()->timestamp_size(); + // Given the next start key in unfragmented_tombstones, + // flush_current_tombstones writes every tombstone fragment that starts + // and ends with a key before next_start_key, and starts with a key greater + // than or equal to cur_start_key. + auto flush_current_tombstones = [&](const Slice& next_start_key) { + auto it = cur_end_keys.begin(); + bool reached_next_start_key = false; + for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) { + Slice cur_end_key = it->user_key; + if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key, + cur_end_key) == 0) { + // Empty tombstone. + continue; + } + if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key, + cur_end_key) <= 0) { + // All the end keys in [it, cur_end_keys.end()) are after + // next_start_key, so the tombstones they represent can be used in + // fragments that start with keys greater than or equal to + // next_start_key. However, the end keys we already passed will not be + // used in any more tombstone fragments. + // + // Remove the fully fragmented tombstones and stop iteration after a + // final round of flushing to preserve the tombstones we can create more + // fragments from. + reached_next_start_key = true; + cur_end_keys.erase(cur_end_keys.begin(), it); + cur_end_key = next_start_key; + } + + // Flush a range tombstone fragment [cur_start_key, cur_end_key), which + // should not overlap with the last-flushed tombstone fragment. + assert(tombstones_.empty() || + icmp.user_comparator()->CompareWithoutTimestamp( + tombstones_.back().end_key, cur_start_key) <= 0); + + // Sort the sequence numbers of the tombstones being fragmented in + // descending order, and then flush them in that order. 
+ autovector seqnums_to_flush; + autovector timestamps_to_flush; + for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) { + seqnums_to_flush.push_back(flush_it->sequence); + if (ts_sz) { + timestamps_to_flush.push_back( + ExtractTimestampFromUserKey(flush_it->user_key, ts_sz)); + } + } + // TODO: bind the two sorting together to be more efficient + std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(), + std::greater()); + if (ts_sz) { + std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(), + [icmp](const Slice& ts1, const Slice& ts2) { + return icmp.user_comparator()->CompareTimestamp(ts1, ts2) > + 0; + }); + } + + size_t start_idx = tombstone_seqs_.size(); + size_t end_idx = start_idx + seqnums_to_flush.size(); + + // If user-defined timestamp is enabled, we should not drop tombstones + // from any snapshot stripe. Garbage collection of range tombstones + // happens in CompactionOutputs::AddRangeDels(). + if (for_compaction && ts_sz == 0) { + // Drop all tombstone seqnums that are not preserved by a snapshot. + SequenceNumber next_snapshot = kMaxSequenceNumber; + for (auto seq : seqnums_to_flush) { + if (seq <= next_snapshot) { + // This seqnum is visible by a lower snapshot. + tombstone_seqs_.push_back(seq); + auto upper_bound_it = + std::lower_bound(snapshots.begin(), snapshots.end(), seq); + if (upper_bound_it == snapshots.begin()) { + // This seqnum is the topmost one visible by the earliest + // snapshot. None of the seqnums below it will be visible, so we + // can skip them. + break; + } + next_snapshot = *std::prev(upper_bound_it); + } + } + end_idx = tombstone_seqs_.size(); + } else { + // The fragmentation is being done for reads, so preserve all seqnums. + tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(), + seqnums_to_flush.end()); + if (ts_sz) { + tombstone_timestamps_.insert(tombstone_timestamps_.end(), + timestamps_to_flush.begin(), + timestamps_to_flush.end()); + } + } + + assert(start_idx < end_idx); + if (ts_sz) { + std::string start_key_with_max_ts; + AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key, + ts_sz); + pinned_slices_.emplace_back(std::move(start_key_with_max_ts)); + Slice start_key = pinned_slices_.back(); + + std::string end_key_with_max_ts; + AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz); + pinned_slices_.emplace_back(std::move(end_key_with_max_ts)); + Slice end_key = pinned_slices_.back(); + + // RangeTombstoneStack expects start_key and end_key to have max + // timestamp. + tombstones_.emplace_back(start_key, end_key, start_idx, end_idx); + } else { + tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, + end_idx); + } + + cur_start_key = cur_end_key; + } + if (!reached_next_start_key) { + // There is a gap between the last flushed tombstone fragment and + // the next tombstone's start key. Remove all the end keys in + // the working set, since we have fully fragmented their corresponding + // tombstones. 
+ cur_end_keys.clear(); + } + cur_start_key = next_start_key; + }; + + pinned_iters_mgr_.StartPinning(); + + bool no_tombstones = true; + for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); + unfragmented_tombstones->Next()) { + const Slice& ikey = unfragmented_tombstones->key(); + Slice tombstone_start_key = ExtractUserKey(ikey); + SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey); + if (!unfragmented_tombstones->IsKeyPinned()) { + pinned_slices_.emplace_back(tombstone_start_key.data(), + tombstone_start_key.size()); + tombstone_start_key = pinned_slices_.back(); + } + no_tombstones = false; + + Slice tombstone_end_key = unfragmented_tombstones->value(); + if (!unfragmented_tombstones->IsValuePinned()) { + pinned_slices_.emplace_back(tombstone_end_key.data(), + tombstone_end_key.size()); + tombstone_end_key = pinned_slices_.back(); + } + if (!cur_end_keys.empty() && + icmp.user_comparator()->CompareWithoutTimestamp( + cur_start_key, tombstone_start_key) != 0) { + // The start key has changed. Flush all tombstones that start before + // this new start key. + flush_current_tombstones(tombstone_start_key); + } + cur_start_key = tombstone_start_key; + + cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion); + } + if (!cur_end_keys.empty()) { + ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end()); + flush_current_tombstones(last_end_key.user_key); + } + + if (!no_tombstones) { + pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(), + false /* arena */); + } +} + +bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower, + SequenceNumber upper) { + std::call_once(seq_set_init_once_flag_, [this]() { + for (auto s : tombstone_seqs_) { + seq_set_.insert(s); + } + }); + auto seq_it = seq_set_.lower_bound(lower); + return seq_it != seq_set_.end() && *seq_it <= upper; +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp, + SequenceNumber _upper_bound, const Slice* ts_upper_bound, + SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_(tombstones), + upper_bound_(_upper_bound), + lower_bound_(_lower_bound), + ts_upper_bound_(ts_upper_bound) { + assert(tombstones_ != nullptr); + Invalidate(); +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones, + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + const Slice* ts_upper_bound, SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_ref_(tombstones), + tombstones_(tombstones_ref_.get()), + upper_bound_(_upper_bound), + lower_bound_(_lower_bound), + ts_upper_bound_(ts_upper_bound) { + assert(tombstones_ != nullptr); + Invalidate(); +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones_cache, + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + const Slice* ts_upper_bound, SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_cache_ref_(tombstones_cache), + tombstones_(tombstones_cache_ref_->tombstones.get()), + upper_bound_(_upper_bound), 
+ lower_bound_(_lower_bound) { + assert(tombstones_ != nullptr); + if (!ts_upper_bound || ts_upper_bound->empty()) { + ts_upper_bound_ = nullptr; + } else { + ts_upper_bound_ = ts_upper_bound; + } + Invalidate(); +} + +void FragmentedRangeTombstoneIterator::SeekToFirst() { + pos_ = tombstones_->begin(); + seq_pos_ = tombstones_->seq_begin(); +} + +void FragmentedRangeTombstoneIterator::SeekToTopFirst() { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = tombstones_->begin(); + SetMaxVisibleSeqAndTimestamp(); + ScanForwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekToLast() { + pos_ = std::prev(tombstones_->end()); + seq_pos_ = std::prev(tombstones_->seq_end()); +} + +void FragmentedRangeTombstoneIterator::SeekToTopLast() { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = std::prev(tombstones_->end()); + SetMaxVisibleSeqAndTimestamp(); + ScanBackwardToVisibleTombstone(); +} + +// @param `target` is a user key, with timestamp if user-defined timestamp is +// enabled. +void FragmentedRangeTombstoneIterator::Seek(const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + SeekToCoveringTombstone(target); + ScanForwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + SeekForPrevToCoveringTombstone(target); + ScanBackwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone( + const Slice& target) { + pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target, + tombstone_end_cmp_); + if (pos_ == tombstones_->end()) { + // All tombstones end before target. + seq_pos_ = tombstones_->seq_end(); + return; + } + SetMaxVisibleSeqAndTimestamp(); +} + +void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone( + const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target, + tombstone_start_cmp_); + if (pos_ == tombstones_->begin()) { + // All tombstones start after target. 
+    Invalidate();
+    return;
+  }
+  --pos_;
+  SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    ++pos_;
+    if (pos_ == tombstones_->end()) {
+      Invalidate();
+      return;
+    }
+    SetMaxVisibleSeqAndTimestamp();
+  }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    if (pos_ == tombstones_->begin()) {
+      Invalidate();
+      return;
+    }
+    --pos_;
+    SetMaxVisibleSeqAndTimestamp();
+  }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+  ++seq_pos_;
+  if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+    ++pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+  ++pos_;
+  if (pos_ == tombstones_->end()) {
+    return;
+  }
+  SetMaxVisibleSeqAndTimestamp();
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+  if (seq_pos_ == tombstones_->seq_begin()) {
+    Invalidate();
+    return;
+  }
+  --seq_pos_;
+  if (pos_ == tombstones_->end() ||
+      seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+    --pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+  if (pos_ == tombstones_->begin()) {
+    Invalidate();
+    return;
+  }
+  --pos_;
+  SetMaxVisibleSeqAndTimestamp();
+  ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+  return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+    const Slice& target_user_key) {
+  SeekToCoveringTombstone(target_user_key);
+  return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(),
+                                                      target_user_key) <= 0
+             ? seq()
+             : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+    const std::vector<SequenceNumber>& snapshots) {
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+      splits;
+  SequenceNumber lower = 0;
+  SequenceNumber upper;
+  for (size_t i = 0; i <= snapshots.size(); i++) {
+    if (i >= snapshots.size()) {
+      upper = kMaxSequenceNumber;
+    } else {
+      upper = snapshots[i];
+    }
+    if (tombstones_->ContainsRange(lower, upper)) {
+      splits.emplace(upper,
+                     std::make_unique<FragmentedRangeTombstoneIterator>(
+                         tombstones_, *icmp_, upper, ts_upper_bound_, lower));
+    }
+    lower = upper + 1;
+  }
+  return splits;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
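A minimal, self-contained sketch of the snapshot-stripe partitioning that
SplitBySnapshot() above performs. The helper name SnapshotStripes is
hypothetical, not part of the RocksDB API: snapshots sorted in ascending order
cut the sequence-number space into inclusive stripes [0, s1], [s1+1, s2], ...,
[sn+1, kMaxSequenceNumber], and SplitBySnapshot() builds one iterator per
stripe that actually contains a tombstone.

#include <cstdint>
#include <utility>
#include <vector>

// Compute the inclusive [lower, upper] sequence-number stripes induced by a
// sorted, ascending list of snapshot sequence numbers (illustrative only).
std::vector<std::pair<uint64_t, uint64_t>> SnapshotStripes(
    const std::vector<uint64_t>& snapshots, uint64_t max_seqno) {
  std::vector<std::pair<uint64_t, uint64_t>> stripes;
  uint64_t lower = 0;
  for (uint64_t snap : snapshots) {
    stripes.emplace_back(lower, snap);  // stripe visible to this snapshot
    lower = snap + 1;
  }
  // Final stripe covers everything newer than the newest snapshot.
  stripes.emplace_back(lower, max_seqno);
  return stripes;
}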
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..df07fa894
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
+struct FragmentedRangeTombstoneListCache {
+  // ensure only the first reader needs to initialize the tombstone list
+  std::mutex reader_mutex;
+  std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+  // readers will first check this bool to avoid acquiring the mutex
+  std::atomic<bool> initialized = false;
+};
+
+struct FragmentedRangeTombstoneList {
+ public:
+  // A compact representation of a "stack" of range tombstone fragments, which
+  // start and end at the same user keys but have different sequence numbers.
+  // The members seq_start_idx and seq_end_idx are intended to be parameters to
+  // seq_iter().
+  // If user-defined timestamp is enabled, `start` and `end` should be user
+  // keys with timestamp, and the timestamps are set to the max timestamp to be
+  // returned by parsed_start_key()/parsed_end_key(). seq_start_idx and
+  // seq_end_idx will also be used as parameters to ts_iter().
+  struct RangeTombstoneStack {
+    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+                        size_t end_idx)
+        : start_key(start),
+          end_key(end),
+          seq_start_idx(start_idx),
+          seq_end_idx(end_idx) {}
+    Slice start_key;
+    Slice end_key;
+    size_t seq_start_idx;
+    size_t seq_end_idx;
+  };
+  // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
+  // both contain timestamp if enabled.
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction = false,
+      const std::vector<SequenceNumber>& snapshots = {});
+
+  std::vector<RangeTombstoneStack>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstoneStack>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+    return std::next(tombstone_seqs_.begin(), idx);
+  }
+
+  std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+    return std::next(tombstone_timestamps_.begin(), idx);
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_begin() const {
+    return tombstone_seqs_.begin();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_end() const {
+    return tombstone_seqs_.end();
+  }
+
+  bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+  // This method is not const as it lazily initializes a set of
+  // sequence numbers (`seq_set_`) on first use.
+  bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+  uint64_t num_unfragmented_tombstones() const {
+    return num_unfragmented_tombstones_;
+  }
+
+  uint64_t total_tombstone_payload_bytes() const {
+    return total_tombstone_payload_bytes_;
+  }
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces. Each
+  // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+  // contains start_key, end_key, and indices that point to sequence numbers
+  // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+  // for_compaction is true, then `snapshots` should be provided. Range
+  // tombstone fragments are dropped if they are not visible in any snapshot
+  // and user-defined timestamp is not enabled. That is, for each snapshot stripe
+  // [lower, upper], the range tombstone fragment with the largest seqno in
+  // [lower, upper] is preserved, and all the other range tombstones are
+  // dropped.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction,
+      const std::vector<SequenceNumber>& snapshots);
+
+  std::vector<RangeTombstoneStack> tombstones_;
+  std::vector<SequenceNumber> tombstone_seqs_;
+  std::vector<Slice> tombstone_timestamps_;
+  std::once_flag seq_set_init_once_flag_;
+  std::set<SequenceNumber> seq_set_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  uint64_t num_unfragmented_tombstones_;
+  uint64_t total_tombstone_payload_bytes_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+  FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
+                                   const InternalKeyComparator& icmp,
+                                   SequenceNumber upper_bound,
+                                   const Slice* ts_upper_bound = nullptr,
+                                   SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+
+  void SeekToFirst() override;
+  void SeekToLast() override;
+
+  void SeekToTopFirst();
+  void SeekToTopLast();
+
+  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+  // seeking should behave. This is OK because they are not currently used, but
+  // eventually FragmentedRangeTombstoneIterator should no longer implement
+  // InternalIterator.
+  //
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+  // the snapshot that ends after target.
+  void Seek(const Slice& target) override;
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+  // snapshot that starts before target.
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+  void Prev() override;
+
+  void TopNext();
+  void TopPrev();
+
+  bool Valid() const override;
+  // Note that key() and value() do not return the correct timestamp.
+  // Callers should call timestamp() to get the current timestamp.
+  Slice key() const override {
+    MaybePinKey();
+    return current_start_key_.Encode();
+  }
+  Slice value() const override { return pos_->end_key; }
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return true; }
+  Status status() const override { return Status::OK(); }
+
+  bool empty() const { return tombstones_->empty(); }
+  void Invalidate() {
+    pos_ = tombstones_->end();
+    seq_pos_ = tombstones_->seq_end();
+    pinned_pos_ = tombstones_->end();
+    pinned_seq_pos_ = tombstones_->seq_end();
+  }
+
+  RangeTombstone Tombstone() const {
+    assert(Valid());
+    if (icmp_->user_comparator()->timestamp_size()) {
+      return RangeTombstone(start_key(), end_key(), seq(), timestamp());
+    }
+    return RangeTombstone(start_key(), end_key(), seq());
+  }
+  // Note that start_key() and end_key() are not guaranteed to have the
+  // correct timestamp. Callers can use timestamp() to get the correct
+  // timestamp.
+  Slice start_key() const { return pos_->start_key; }
+  Slice end_key() const { return pos_->end_key; }
+  SequenceNumber seq() const { return *seq_pos_; }
+  Slice timestamp() const {
+    // seqno and timestamp are stored in the same order.
+    return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
+  }
+  // The current use case is CompactionRangeDelAggregator setting
+  // full_history_ts_low_.
+  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+    ts_upper_bound_ = ts_upper_bound;
+  }
+
+  ParsedInternalKey parsed_start_key() const {
+    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+  ParsedInternalKey parsed_end_key() const {
+    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+
+  // Return the max sequence number of a range tombstone that covers
+  // the given user key.
+  // If there is no covering tombstone, then 0 is returned.
+  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+  // Splits the iterator into n+1 iterators (where n is the number of
+  // snapshots), each providing a view over a "stripe" of sequence numbers. The
+  // iterators are keyed by the upper bound of their ranges (the provided
+  // snapshots + kMaxSequenceNumber).
+  //
+  // NOTE: the iterators in the returned map are no longer valid if their
+  // parent iterator is deleted, since they do not modify the refcount of the
+  // underlying tombstone list. Therefore, this map should be deleted before
+  // the parent iterator.
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return upper_bound_; }
+  SequenceNumber lower_bound() const { return lower_bound_; }
+
+  uint64_t num_unfragmented_tombstones() const {
+    return tombstones_->num_unfragmented_tombstones();
+  }
+  uint64_t total_tombstone_payload_bytes() const {
+    return tombstones_->total_tombstone_payload_bytes();
+  }
+
+ private:
+  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+  struct RangeTombstoneStackStartComparator {
+    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  struct RangeTombstoneStackEndComparator {
+    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  void MaybePinKey() const {
+    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+      pinned_pos_ = pos_;
+      pinned_seq_pos_ = seq_pos_;
+    }
+  }
+
+  void SeekToCoveringTombstone(const Slice& key);
+  void SeekForPrevToCoveringTombstone(const Slice& key);
+  void ScanForwardToVisibleTombstone();
+  void ScanBackwardToVisibleTombstone();
+  bool ValidPos() const {
+    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+  }
+
+  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+  const InternalKeyComparator* icmp_;
+  const Comparator* ucmp_;
+  std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+  FragmentedRangeTombstoneList* tombstones_;
+  SequenceNumber upper_bound_;
+  SequenceNumber lower_bound_;
+  // Only consider timestamps <= ts_upper_bound_.
+  const Slice* ts_upper_bound_;
+  std::vector<RangeTombstoneStack>::const_iterator pos_;
+  std::vector<SequenceNumber>::const_iterator seq_pos_;
+  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+  mutable InternalKey current_start_key_;
+
+  // Check the current RangeTombstoneStack `pos_` against timestamp
+  // upper bound `ts_upper_bound_` and sequence number upper bound
+  // `upper_bound_`. Update the sequence number (and timestamp) pointer
+  // `seq_pos_` to the first valid position satisfying both bounds.
+  void SetMaxVisibleSeqAndTimestamp() {
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+    if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+      auto ts_pos = std::lower_bound(
+          tombstones_->ts_iter(pos_->seq_start_idx),
+          tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+          [this](const Slice& s1, const Slice& s2) {
+            return ucmp_->CompareTimestamp(s1, s2) > 0;
+          });
+      auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+      auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+      if (seq_idx < ts_idx) {
+        // seq and ts are ordered in non-increasing order. Only update seq_pos_
+        // to a larger index for a smaller sequence number and timestamp.
+        seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+      }
+    }
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..46b3c99b5
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<InternalIterator>(
+      new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+                       const FragmentedRangeTombstoneIterator* iter) {
+  // Test InternalIterator interface.
+  EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+  EXPECT_EQ(tombstone.end_key_, iter->value());
+  EXPECT_EQ(tombstone.seq_, iter->seq());
+
+  // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); +} + +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + +void VerifyVisibleTombstones( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToTopFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + +struct SeekTestCase { + Slice seek_target; + RangeTombstone expected_position; + bool out_of_range; +}; + +void VerifySeek(FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + iter->Seek(testcase.seek_target); + if (testcase.out_of_range) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(testcase.expected_position, iter); + } + } +} + +void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + iter->SeekForPrev(testcase.seek_target); + if (testcase.out_of_range) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(testcase.expected_position, iter); + } + } +} + +struct MaxCoveringTombstoneSeqnumTestCase { + Slice user_key; + SequenceNumber result; +}; + +void VerifyMaxCoveringTombstoneSeqnum( + FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + EXPECT_EQ(testcase.result, + iter->MaxCoveringTombstoneSeqnum(testcase.user_key)); + } +} + +} // anonymous namespace + +TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) { + auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) { + auto range_del_iter = MakeRangeDelIter( + {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + 
ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) { + auto range_del_iter = + MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, + {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) { + auto range_del_iter = MakeRangeDelIter({{"a", "c", 30}, + {"a", "g", 20}, + {"a", "e", 10}, + {"a", "g", 7}, + {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 30}, + {"a", "c", 20}, + {"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 20}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 20}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp, + 7 /* upper_bound */); + FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp, + 5 /* upper_bound */); + FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) { + VerifyFragmentedRangeDels(iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + } + + ASSERT_EQ(0, iter1.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound()); + VerifyVisibleTombstones(&iter1, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", 
"g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter2.lower_bound()); + ASSERT_EQ(9, iter2.upper_bound()); + VerifyVisibleTombstones(&iter2, {{"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter3.lower_bound()); + ASSERT_EQ(7, iter3.upper_bound()); + VerifyVisibleTombstones(&iter3, {{"c", "e", 6}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter4.lower_bound()); + ASSERT_EQ(5, iter4.upper_bound()); + VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter5.lower_bound()); + ASSERT_EQ(3, iter5.upper_bound()); + VerifyVisibleTombstones(&iter5, {{"j", "l", 2}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(9, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {} /* snapshots */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, + OverlapAndRepeatedStartKeyForCompactionWithSnapshot) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {20, 9} /* upper_bounds */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList 
fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({} /* snapshots */); + ASSERT_EQ(1, split_iters.size()); + + auto* split_iter = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(0, split_iter->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound()); + VerifyVisibleTombstones(split_iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */); + ASSERT_EQ(5, split_iters.size()); + + auto* split_iter1 = split_iters[3].get(); + ASSERT_EQ(0, split_iter1->lower_bound()); + ASSERT_EQ(3, split_iter1->upper_bound()); + VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}}); + + auto* split_iter2 = split_iters[5].get(); + ASSERT_EQ(4, split_iter2->lower_bound()); + ASSERT_EQ(5, split_iter2->upper_bound()); + VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}}); + + auto* split_iter3 = split_iters[7].get(); + ASSERT_EQ(6, split_iter3->lower_bound()); + ASSERT_EQ(7, split_iter3->upper_bound()); + VerifyVisibleTombstones(split_iter3, + {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}}); + + auto* split_iter4 = split_iters[9].get(); + ASSERT_EQ(8, split_iter4->lower_bound()); + ASSERT_EQ(9, split_iter4->upper_bound()); + VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}}); + + auto* split_iter5 = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(10, split_iter5->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound()); + VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"a", {"j", "l", 2}}, + {"e", {"j", "l", 2}}, + {"l", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */}, + {"e", {}, true /* out of range */}, + {"l", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekCovered) { + // Same tombstones as OverlapAndRepeatedStartKey. 
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"b", {"j", "l", 2}}, + {"f", {"j", "l", 2}}, + {"m", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */}, + {"f", {}, true /* out of range */}, + {"m", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"j", "l", 4}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"g", "i", 6}}, + {"n", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"c", {"j", "l", 2}}, + {"g", {"j", "l", 2}}, + {"i", {"j", "l", 2}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */}, + {"g", {}, true /* out of range */}, + {"i", {}, true /* out of range */}, + {"n", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}}); + VerifySeekForPrev(&iter, + {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}}); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h new file mode 100644 index 000000000..c042352db --- /dev/null +++ b/src/rocksdb/db/read_callback.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
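A minimal standalone sketch of the visibility check that the ReadCallback
class in the header below encodes. The function and parameter names here are
illustrative, not RocksDB API: sequence numbers below the `min_uncommitted`
floor are known-committed and always visible; those above the `max_visible`
ceiling are never visible; only the window in between needs the expensive full
check.

#include <cstdint>
#include <functional>

bool IsSeqVisible(uint64_t seq, uint64_t min_uncommitted, uint64_t max_visible,
                  const std::function<bool(uint64_t)>& full_check) {
  if (seq < min_uncommitted) {
    return true;  // below the uncommitted floor: always visible
  }
  if (seq > max_visible) {
    return false;  // above the snapshot ceiling: never visible
  }
  return full_check(seq);  // in the gray zone: defer to the full check
}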
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReadCallback {
+ public:
+  explicit ReadCallback(SequenceNumber last_visible_seq)
+      : max_visible_seq_(last_visible_seq) {}
+  ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+      : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+  virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number is visible; if not, the caller
+  // moves on to the next seq number.
+  virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
+  inline bool IsVisible(SequenceNumber seq) {
+    assert(min_uncommitted_ > 0);
+    assert(min_uncommitted_ >= kMinUnCommittedSeq);
+    if (seq < min_uncommitted_) {  // handles seq == 0 as well
+      assert(seq <= max_visible_seq_);
+      return true;
+    } else if (max_visible_seq_ < seq) {
+      assert(seq != 0);
+      return false;
+    } else {
+      assert(seq != 0);  // already handled in the first if-then clause
+      return IsVisibleFullCheck(seq);
+    }
+  }
+
+  inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+  // Refresh to a more recent visible seq
+  virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+  // The max visible seq; it is usually the snapshot but could be larger if
+  // the transaction has its own writes written to the db.
+  SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+  // Any seq less than min_uncommitted_ is committed.
+  const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..1829a79f2
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,771 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Repairer does best-effort recovery to recover as much data as possible after
+// a disaster without compromising consistency. It does not guarantee bringing
+// the database to a time-consistent state.
+//
+// The repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to tables
+//
+// Every log file that is active is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference to
+// data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+// +// (d) Write Descriptor +// +// We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2c) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// (d) We can provide options for time consistent recovery and unsafe recovery +// (ignore checksum failure when applicable) +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. + +#ifndef ROCKSDB_LITE + +#include + +#include "db/builder.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" +#include "table/unique_id_impl.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& default_cf_opts, + const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) + : dbname_(dbname), + db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)), + env_(db_options.env), + file_options_(), + db_options_(SanitizeOptions(dbname_, db_options)), + immutable_db_options_(ImmutableDBOptions(db_options_)), + icmp_(default_cf_opts.comparator), + default_cf_opts_( + SanitizeOptions(immutable_db_options_, default_cf_opts)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), + unknown_cf_opts_( + SanitizeOptions(immutable_db_options_, unknown_cf_opts)), + create_unknown_cfs_(create_unknown_cfs), + raw_table_cache_( + // TableCache can be small since we expect each table to be opened + // once. + NewLRUCache(10, db_options_.table_cache_numshardbits)), + table_cache_(new TableCache(default_iopts_, &file_options_, + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, db_session_id_)), + wb_(db_options_.db_write_buffer_size), + wc_(db_options_.delayed_write_rate), + vset_(dbname_, &immutable_db_options_, file_options_, + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", db_session_id_), + next_file_number_(1), + db_lock_(nullptr), + closed_(false) { + for (const auto& cfd : column_families) { + cf_name_to_opts_[cfd.name] = cfd.options; + } + } + + const ColumnFamilyOptions* GetColumnFamilyOptions( + const std::string& cf_name) { + if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) { + if (create_unknown_cfs_) { + return &unknown_cf_opts_; + } + return nullptr; + } + return &cf_name_to_opts_[cf_name]; + } + + // Adds a column family to the VersionSet with cf_options_ and updates + // manifest. 
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + const auto* cf_opts = GetColumnFamilyOptions(cf_name); + if (cf_opts == nullptr) { + return Status::Corruption("Encountered unknown column family with name=" + + cf_name + ", id=" + std::to_string(cf_id)); + } + Options opts(db_options_, *cf_opts); + MutableCFOptions mut_cf_opts(opts); + + VersionEdit edit; + edit.SetComparatorName(opts.comparator->Name()); + edit.SetLogNumber(0); + edit.SetColumnFamily(cf_id); + ColumnFamilyData* cfd; + cfd = nullptr; + edit.AddColumnFamily(cf_name); + + mutex_.Lock(); + std::unique_ptr db_dir; + Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), + &db_dir, nullptr); + if (status.ok()) { + status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(), + false /* new_descriptor_log */, cf_opts); + } + mutex_.Unlock(); + return status; + } + + Status Close() { + Status s = Status::OK(); + if (!closed_) { + if (db_lock_ != nullptr) { + s = env_->UnlockFile(db_lock_); + db_lock_ = nullptr; + } + closed_ = true; + } + return s; + } + + ~Repairer() { Close().PermitUncheckedError(); } + + Status Run() { + Status status = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!status.ok()) { + return status; + } + status = FindFiles(); + DBImpl* db_impl = nullptr; + if (status.ok()) { + // Discard older manifests and start a fresh one + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + // Just create a DBImpl temporarily so we can reuse NewDB() + db_impl = new DBImpl(db_options_, dbname_); + status = db_impl->NewDB(/*new_filenames=*/nullptr); + } + delete db_impl; + + if (status.ok()) { + // Recover using the fresh manifest created by NewDB() + status = + vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false); + } + if (status.ok()) { + // Need to scan existing SST files first so the column families are + // created before we process WAL files + ExtractMetaData(); + + // ExtractMetaData() uses table_fds_ to know which SST files' metadata to + // extract -- we need to clear it here since metadata for existing SST + // files has been extracted already + table_fds_.clear(); + ConvertLogFilesToTables(); + ExtractMetaData(); + status = AddTables(); + } + if (status.ok()) { + uint64_t bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.fd.GetFileSize(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "**** Repaired rocksdb %s; " + "recovered %" ROCKSDB_PRIszt " files; %" PRIu64 + " bytes. " + "Some data may have been lost. 
" + "****", + dbname_.c_str(), tables_.size(), bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + uint32_t column_family_id; + std::string column_family_name; + }; + + std::string const dbname_; + std::string db_session_id_; + Env* const env_; + const FileOptions file_options_; + const DBOptions db_options_; + const ImmutableDBOptions immutable_db_options_; + const InternalKeyComparator icmp_; + const ColumnFamilyOptions default_cf_opts_; + const ImmutableOptions default_iopts_; // table_cache_ holds reference + const ColumnFamilyOptions unknown_cf_opts_; + const bool create_unknown_cfs_; + std::shared_ptr raw_table_cache_; + std::unique_ptr table_cache_; + WriteBufferManager wb_; + WriteController wc_; + VersionSet vset_; + std::unordered_map cf_name_to_opts_; + InstrumentedMutex mutex_; + + std::vector manifests_; + std::vector table_fds_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + // Lock over the persistent DB state. Non-nullptr iff successfully + // acquired. + FileLock* db_lock_; + bool closed_; + + Status FindFiles() { + std::vector filenames; + bool found_file = false; + std::vector to_search_paths; + + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { + to_search_paths.push_back(db_options_.db_paths[path_id].path); + } + + // search wal_dir if user uses a customize wal_dir + bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_); + if (!same) { + to_search_paths.push_back(immutable_db_options_.wal_dir); + } + + for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { + ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n", + to_search_paths[path_id].c_str()); + Status status = env_->GetChildren(to_search_paths[path_id], &filenames); + if (!status.ok()) { + return status; + } + if (!filenames.empty()) { + found_file = true; + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kWalFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_fds_.emplace_back(number, static_cast(path_id), + 0); + } else { + // Ignore other files + } + } + } + } + } + if (!found_file) { + return Status::Corruption(dbname_, "repair found no files"); + } + return Status::OK(); + } + + void ConvertLogFilesToTables() { + const auto& wal_dir = immutable_db_options_.GetWalDir(); + for (size_t i = 0; i < logs_.size(); i++) { + // we should use LogFileName(wal_dir, logs_[i]) here. user might uses + // wal_dir option. + std::string logname = LogFileName(wal_dir, logs_[i]); + Status status = ConvertLogToTable(wal_dir, logs_[i]); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Log #%" PRIu64 ": ignoring conversion error: %s", + logs_[i], status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + void Corruption(size_t bytes, const Status& s) override { + // We print error messages for corruption, but continue repairing. 
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", + lognum, static_cast(bytes), s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(wal_dir, log); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader, + nullptr /* dbg */, nullptr /* rate limiter */); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log; + reporter.lognum = log; + // We intentionally make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). + log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, + true /*enable checksum*/, log); + + // Initialize per-column family memtables + for (auto* cfd : *vset_.GetColumnFamilySet()) { + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + } + auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet()); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + Status record_status = WriteBatchInternal::SetContents(&batch, record); + if (record_status.ok()) { + record_status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + } + if (record_status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s", + log, record_status.ToString().c_str()); + } + } + + // Dump a table for each column family with entries in this log file. + for (auto* cfd : *vset_.GetColumnFamilySet()) { + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. 
+ MemTable* mem = cfd->mem(); + if (mem->IsEmpty()) { + continue; + } + + FileMetaData meta; + meta.fd = FileDescriptor(next_file_number_++, 0, 0); + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + int64_t _current_time = 0; + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error + const uint64_t current_time = static_cast(_current_time); + meta.file_creation_time = current_time; + SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); + + auto write_hint = cfd->CalculateSSTWriteHint(0); + std::vector> + range_del_iters; + auto range_del_iter = mem->NewRangeTombstoneIterator( + ro, kMaxSequenceNumber, false /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + + IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, + 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_, + 0 /*target_file_size*/, meta.fd.GetNumber()); + + SeqnoToTimeMapping empty_seqno_time_mapping; + status = BuildTable( + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + file_options_, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, + empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */, + Env::IO_HIGH, nullptr /* table_properties */, write_hint); + ROCKS_LOG_INFO(db_options_.info_log, + "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", + log, counter, meta.fd.GetNumber(), + status.ToString().c_str()); + if (status.ok()) { + if (meta.fd.GetFileSize() > 0) { + table_fds_.push_back(meta.fd); + } + } else { + break; + } + } + delete cf_mems; + return status; + } + + void ExtractMetaData() { + for (size_t i = 0; i < table_fds_.size(); i++) { + TableInfo t; + t.meta.fd = table_fds_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName( + db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId()); + char file_num_buf[kFormatFileNumberBufSize]; + FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), + file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s", + file_num_buf, status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName( + db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId()); + int counter = 0; + uint64_t file_size; + Status status = env_->GetFileSize(fname, &file_size); + t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(), + file_size); + std::shared_ptr props; + if (status.ok()) { + status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta, + &props); + } + if (status.ok()) { + auto s = + GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &t->meta.unique_id); + if (!s.ok()) { + 
ROCKS_LOG_WARN(db_options_.info_log,
+                       "Table #%" PRIu64
+                       ": unable to get unique id, default to Unknown.",
+                       t->meta.fd.GetNumber());
+      }
+      t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+      if (t->column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": column family unknown (probably due to legacy format); "
+            "adding to default column family id 0.",
+            t->meta.fd.GetNumber());
+        t->column_family_id = 0;
+      }
+
+      if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+          nullptr) {
+        status =
+            AddColumnFamily(props->column_family_name, t->column_family_id);
+      }
+      t->meta.oldest_ancester_time = props->creation_time;
+    }
+    ColumnFamilyData* cfd = nullptr;
+    if (status.ok()) {
+      cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+      if (cfd->GetName() != props->column_family_name) {
+        ROCKS_LOG_ERROR(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": inconsistent column family name '%s'; expected '%s' for column "
+            "family id %" PRIu32 ".",
+            t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+            cfd->GetName().c_str(), t->column_family_id);
+        status = Status::Corruption(dbname_, "inconsistent column family name");
+      }
+    }
+    if (status.ok()) {
+      ReadOptions ropts;
+      ropts.total_order_seek = true;
+      InternalIterator* iter = table_cache_->NewIterator(
+          ropts, file_options_, cfd->internal_comparator(), t->meta,
+          nullptr /* range_del_agg */,
+          cfd->GetLatestMutableCFOptions()->prefix_extractor,
+          /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+          TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+          /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false);
+      ParsedInternalKey parsed;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        Status pik_status =
+            ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
+        if (!pik_status.ok()) {
+          ROCKS_LOG_ERROR(db_options_.info_log,
+                          "Table #%" PRIu64 ": unparsable key - %s",
+                          t->meta.fd.GetNumber(), pik_status.getState());
+          continue;
+        }
+
+        counter++;
+
+        status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+                                          parsed.type);
+        if (!status.ok()) {
+          break;
+        }
+      }
+      if (status.ok() && !iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+
+      ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+                     t->meta.fd.GetNumber(), counter,
+                     status.ToString().c_str());
+    }
+    if (status.ok()) {
+      // XXX/FIXME: This is just basic, naive handling of range tombstones,
+      // like the call to UpdateBoundariesForRange in builder.cc where we
+      // assume an SST file is a full sorted run. This probably needs the extra
+      // logic from compaction_job.cc around the call to
+      // UpdateBoundariesForRange (to handle range tombstones extending beyond
+      // the range of other entries).
+ ReadOptions ropts; + std::unique_ptr r_iter; + status = table_cache_->GetRangeTombstoneIterator( + ropts, cfd->internal_comparator(), t->meta, &r_iter); + + if (r_iter) { + r_iter->SeekToFirst(); + + while (r_iter->Valid()) { + auto tombstone = r_iter->Tombstone(); + auto kv = tombstone.Serialize(); + t->meta.UpdateBoundariesForRange( + kv.first, tombstone.SerializeEndKey(), tombstone.seq_, + cfd->internal_comparator()); + r_iter->Next(); + } + } + } + return status; + } + + Status AddTables() { + std::unordered_map> cf_id_to_tables; + SequenceNumber max_sequence = 0; + for (size_t i = 0; i < tables_.size(); i++) { + cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]); + if (max_sequence < tables_[i].meta.fd.largest_seqno) { + max_sequence = tables_[i].meta.fd.largest_seqno; + } + } + vset_.SetLastAllocatedSequence(max_sequence); + vset_.SetLastPublishedSequence(max_sequence); + vset_.SetLastSequence(max_sequence); + + for (const auto& cf_id_and_tables : cf_id_to_tables) { + auto* cfd = + vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first); + VersionEdit edit; + edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetLogNumber(0); + edit.SetNextFile(next_file_number_); + edit.SetColumnFamily(cfd->GetID()); + + // TODO(opt): separate out into multiple levels + for (const auto* table : cf_id_and_tables.second) { + edit.AddFile( + 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->meta.fd.smallest_seqno, + table->meta.fd.largest_seqno, table->meta.marked_for_compaction, + table->meta.temperature, table->meta.oldest_blob_file_number, + table->meta.oldest_ancester_time, table->meta.file_creation_time, + table->meta.file_checksum, table->meta.file_checksum_func_name, + table->meta.unique_id); + } + assert(next_file_number_ > 0); + vset_.MarkFileNumberUsed(next_file_number_ - 1); + mutex_.Lock(); + std::unique_ptr db_dir; + Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), + &db_dir, nullptr); + if (status.ok()) { + status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, db_dir.get(), + false /* new_descriptor_log */); + } + mutex_.Unlock(); + if (!status.ok()) { + return status; + } + } + return Status::OK(); + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != nullptr) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == nullptr) ? 
+    Status s = env_->RenameFile(fname, new_file);
+    ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+                   s.ToString().c_str());
+  }
+};
+
+Status GetDefaultCFOptions(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    ColumnFamilyOptions* res) {
+  assert(res != nullptr);
+  auto iter = std::find_if(column_families.begin(), column_families.end(),
+                           [](const ColumnFamilyDescriptor& cfd) {
+                             return cfd.name == kDefaultColumnFamilyName;
+                           });
+  if (iter == column_families.end()) {
+    return Status::InvalidArgument(
+        "column_families", "Must contain entry for default column family");
+  }
+  *res = iter->options;
+  return Status::OK();
+}
+}  // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+                    ColumnFamilyOptions() /* unknown_cf_opts */,
+                    false /* create_unknown_cfs */);
+  status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+                    unknown_cf_opts, true /* create_unknown_cfs */);
+  status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Options opts(options);
+  DBOptions db_options(opts);
+  ColumnFamilyOptions cf_options(opts);
+
+  Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
+                    cf_options /* unknown_cf_opts */,
+                    true /* create_unknown_cfs */);
+  Status status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..644a9270d
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+  RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {}
+
+  Status GetFirstSstPath(std::string* first_sst_path) {
+    assert(first_sst_path != nullptr);
+    first_sst_path->clear();
+    uint64_t manifest_size;
+    std::vector<std::string> files;
+    Status s = db_->GetLiveFiles(files, &manifest_size);
+    if (s.ok()) {
+      auto sst_iter =
+          std::find_if(files.begin(), files.end(), [](const std::string& file) {
+            uint64_t number;
+            FileType type;
+            bool ok = ParseFileName(file, &number, &type);
+            return ok && type == kTableFile;
+          });
+      *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+    }
+    return s;
+  }
+
+  void ReopenWithSstIdVerify() {
+    std::atomic_int verify_passed{0};
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+          // override job status
+          auto id = static_cast<UniqueId64x2*>(arg);
+          assert(*id != kNullUniqueId64x2);
+          verify_passed++;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto options = CurrentOptions();
+    options.verify_sst_unique_id_in_manifest = true;
+    Reopen(options);
+
+    ASSERT_GT(verify_passed, 0);
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+};
+
+TEST_F(RepairTest, LostManifest) {
+  // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+  // the day.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but delete it after Close()
+  // to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, LostManifestMoreDbFeatures) {
+  // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+  // the day.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Put("key3", "val3"));
+  ASSERT_OK(Put("key4", "val4"));
+  ASSERT_OK(Flush());
+  // Test an SST file containing only a range tombstone
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "key2", "key3z"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but delete it after Close()
+  // to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  // repair from sst should work with unique_id verification
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "NOT_FOUND");
+  ASSERT_EQ(Get("key3"), "NOT_FOUND");
+  ASSERT_EQ(Get("key4"), "val4");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+  // Manifest is in an invalid format. Expect a full recovery.
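+  // RepairDB() discards the unreadable MANIFEST, rediscovers the SST files by
+  // scanning the DB directory, and writes a fresh MANIFEST, so both flushed
+  // key-value pairs should survive.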
+ ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + // Need to get path before Close() deletes db_, but overwrite it after Close() + // to ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + + ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", + false /* use_fsync */)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + ReopenWithSstIdVerify(); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "val2"); +} + +TEST_F(RepairTest, IncompleteManifest) { + // In this case, the manifest is valid but does not reference all of the SST + // files. Expect a full recovery. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + std::string orig_manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + CopyFile(orig_manifest_path, orig_manifest_path + ".tmp"); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + // Need to get path before Close() deletes db_, but overwrite it after Close() + // to ensure Close() didn't change the manifest. + std::string new_manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(new_manifest_path)); + // Replace the manifest with one that is only aware of the first SST file. + CopyFile(orig_manifest_path + ".tmp", new_manifest_path); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + ReopenWithSstIdVerify(); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "val2"); +} + +TEST_F(RepairTest, PostRepairSstFileNumbering) { + // Verify after a DB is repaired, new files will be assigned higher numbers + // than old files. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + Close(); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + ReopenWithSstIdVerify(); + + uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + ASSERT_GE(post_repair_file_num, pre_repair_file_num); +} + +TEST_F(RepairTest, LostSst) { + // Delete one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); + ASSERT_FALSE(sst_path.empty()); + ASSERT_OK(env_->DeleteFile(sst_path)); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + ReopenWithSstIdVerify(); + + // Exactly one of the key-value pairs should be in the DB now. + ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); +} + +TEST_F(RepairTest, CorruptSst) { + // Corrupt one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); + ASSERT_FALSE(sst_path.empty()); + + ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah", + false /* use_fsync */)); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + ReopenWithSstIdVerify(); + + // Exactly one of the key-value pairs should be in the DB now. 
+  ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+  // This test case invokes repair while some data is unflushed, then verifies
+  // that data is in the db.
+  ASSERT_OK(Put("key", "val"));
+  VectorLogPtr wal_files;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+  ASSERT_EQ(wal_files.size(), 1);
+  {
+    uint64_t total_ssts_size;
+    std::unordered_map<std::string, uint64_t> sst_files;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+    ASSERT_EQ(total_ssts_size, 0);
+  }
+  // Need to get path before Close() deletes db_, but delete it after Close()
+  // to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+  ASSERT_EQ(wal_files.size(), 0);
+  {
+    uint64_t total_ssts_size;
+    std::unordered_map<std::string, uint64_t> sst_files;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+    ASSERT_GT(total_ssts_size, 0);
+  }
+  ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key", "val"));
+    ASSERT_OK(Put("foo", "bar"));
+    VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    ASSERT_EQ(wal_files.size(), 1);
+    {
+      uint64_t total_ssts_size;
+      std::unordered_map<std::string, uint64_t> sst_files;
+      ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+      ASSERT_EQ(total_ssts_size, 0);
+    }
+    std::string manifest_path =
+        DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+    Close();
+    ASSERT_OK(env_->FileExists(manifest_path));
+    ASSERT_OK(env_->DeleteFile(manifest_path));
+    ASSERT_OK(RepairDB(dbname_, options));
+
+    // make sure that all WALs are converted to SSTables.
+    options.wal_dir = "";
+
+    ReopenWithSstIdVerify();
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    ASSERT_EQ(wal_files.size(), 0);
+    {
+      uint64_t total_ssts_size;
+      std::unordered_map<std::string, uint64_t> sst_files;
+      ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+      ASSERT_GT(total_ssts_size, 0);
+    }
+    ASSERT_EQ(Get("key"), "val");
+    ASSERT_EQ(Get("foo"), "bar");
+
+  } while (ChangeWalOptions());
}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+  // Verify repair logic associates SST files with their original column
+  // families.
+  const int kNumCfs = 3;
+  const int kEntriesPerCf = 2;
+  DestroyAndReopen(CurrentOptions());
+  CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+      if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+        // Leave one unflushed so we can verify WAL entries are properly
+        // associated with column families.
+        continue;
+      }
+      ASSERT_OK(Flush(i));
+    }
+  }
+
+  // Need to get path before Close() deletes db_, but delete it after Close()
+  // to ensure Close() doesn't re-create the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+                           CurrentOptions());
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+  // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+  // database with different options for column families.
+  const int kNumCfs = 2;
+  const int kEntriesPerCf = 2;
+
+  Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+  opts.comparator = BytewiseComparator();
+  rev_opts.comparator = ReverseBytewiseComparator();
+
+  DestroyAndReopen(opts);
+  CreateColumnFamilies({"reverse"}, rev_opts);
+  ReopenWithColumnFamilies({"default", "reverse"},
+                           std::vector<Options>{opts, rev_opts});
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+      if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+        // Leave one unflushed so we can verify RepairDB's flush logic
+        continue;
+      }
+      ASSERT_OK(Flush(i));
+    }
+  }
+  Close();
+
+  // RepairDB() records the comparator in the manifest, and DB::Open would fail
+  // if a different comparator were used.
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+                     opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+
+  // Examine table properties to verify RepairDB() used the right options when
+  // converting WAL->SST
+  TablePropertiesCollection fname_to_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props));
+  ASSERT_EQ(fname_to_props.size(), 2U);
+  for (const auto& fname_and_props : fname_to_props) {
+    std::string comparator_name(rev_opts.comparator->Name());
+    ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name);
+  }
+  Close();
+
+  // Also check comparator when it's provided via "unknown" CF options
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+                     rev_opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+  {
+    bool tmp;
+    if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+      fprintf(stderr,
+              "skipping RepairTest.DbNameContainsTrailingSlash due to "
+              "unsupported Env::AreFilesSame\n");
+      return;
+    }
+  }
+
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  Close();
+
+  ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+  ReopenWithSstIdVerify();
+  ASSERT_EQ(Get("key"), "val");
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
"SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/seqno_time_test.cc b/src/rocksdb/db/seqno_time_test.cc new file mode 100644 index 000000000..12394a368 --- /dev/null +++ b/src/rocksdb/db/seqno_time_test.cc @@ -0,0 +1,996 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "db/periodic_task_scheduler.h" +#include "db/seqno_to_time_mapping.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/utilities/debug.h" +#include "test_util/mock_time_env.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +class SeqnoTimeTest : public DBTestBase { + public: + SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_ = std::make_unique(env_, mock_clock_); + } + + protected: + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + auto periodic_task_scheduler_ptr = + reinterpret_cast(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); + }); + } + + // make sure the file is not in cache, otherwise it won't have IO info + void AssertKeyTemperature(int key_id, Temperature expected_temperature) { + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + std::string result = Get(Key(key_id)); + ASSERT_FALSE(result.empty()); + ASSERT_GT(iostats->bytes_read, 0); + switch (expected_temperature) { + case Temperature::kUnknown: + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, + 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, + 0); + break; + case Temperature::kCold: + ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count, + 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read, + 0); + break; + default: + // the test only support kCold now for the bottommost temperature + FAIL(); + } + } +}; + +TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preclude_last_level_data_seconds = 10000; + options.env = mock_env_.get(); + options.bottommost_temperature = Temperature::kCold; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // All data is 
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // read a random key, which should be hot (kUnknown)
+  AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Write more data, but still all hot until the 10th SST: a key is written
+  // every 10 seconds and there are 100 keys per SST, so each SST spans 1000
+  // seconds, while preclude_last_level_data_seconds is 10k.
+  for (; sst_num < kNumTrigger * 2; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->WaitForCompact(true));
+    ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+    ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  }
+
+  // Now we have both hot data and cold data
+  for (; sst_num < kNumTrigger * 3; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->WaitForCompact(true));
+  }
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+  uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(hot_data_size, 0);
+  ASSERT_GT(cold_data_size, 0);
+  // the first few keys should be cold
+  AssertKeyTemperature(20, Temperature::kCold);
+
+  for (int i = 0; i < 30; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun([&] {
+      mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
+    });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+    // the hot/cold cutoff key should be between i * 20 + 200 and i * 20 + 250
+    AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
+    AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
+  }
+
+  ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
+
+  // Wait again; most of the data should be cold after that. It may not all be
+  // cold, because if no new data is written to the SSTs, compaction will not
+  // get new seqno->time samples to decide the age of the last few entries.
+  for (int i = 0; i < 5; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  // any random data close to the end should be cold
+  AssertKeyTemperature(1000, Temperature::kCold);
+
+  // close explicitly, because the env is a local variable which will be
+  // released first.
+  Close();
+}
+
+TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 10000;
+  options.env = mock_env_.get();
+  options.bottommost_temperature = Temperature::kCold;
+  options.num_levels = kNumLevels;
+  options.level_compaction_dynamic_level_bytes = true;
+  // TODO(zjay): for level compaction, auto-compaction may get stuck in an
+  // endless loop if the penultimate level score is > 1 but the hot data is
+  // not yet cold enough to compact to the last level, which keeps triggering
+  // compactions.
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write times of the first few keys are
+  // going to be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  int sst_num = 0;
+  // Write overlapping files
+  for (; sst_num < 4; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // All data is hot, only output to the penultimate level
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // read a random key, which should be hot (kUnknown)
+  AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Add more data to get mixed hot and cold data
+  for (; sst_num < 14; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Compact the files to the last level, which should split the hot/cold data
+  MoveFilesToLevel(6);
+  uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+  uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(hot_data_size, 0);
+  ASSERT_GT(cold_data_size, 0);
+  // the first few keys should be cold
+  AssertKeyTemperature(20, Temperature::kCold);
+
+  // Wait some time; with each wait, the amount of cold data increases and hot
+  // data decreases
+  for (int i = 0; i < 30; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    uint64_t pre_hot = hot_data_size;
+    uint64_t pre_cold = cold_data_size;
+    hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+    cold_data_size = GetSstSizeHelper(Temperature::kCold);
+    ASSERT_LT(hot_data_size, pre_hot);
+    ASSERT_GT(cold_data_size, pre_cold);
+
+    // the hot/cold cutoff key should be between i * 20 + 400 and i * 20 + 450
+    AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
+    AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
+  }
+
+  // Wait again; most of the data should be cold after that. Hot data might
+  // not be empty, because if we don't write new data, there's no seqno->time
+  // sampling available to the compaction
+  for (int i = 0; i < 5; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  // any random data close to the end should be cold
+  AssertKeyTemperature(1000, Temperature::kCold);
+
+  Close();
+}
+
+enum class SeqnoTimeTestType : char {
+  kTrackInternalTimeSeconds = 0,
+  kPrecludeLastLevel = 1,
+  kBothSetTrackSmaller = 2,
+};
+
+class SeqnoTimeTablePropTest
+    : public SeqnoTimeTest,
+      public ::testing::WithParamInterface<SeqnoTimeTestType> {
+ public:
+  SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
+
+  void SetTrackTimeDurationOptions(uint64_t track_time_duration,
+                                   Options& options) const {
+    // either option set will enable the time tracking feature
+    switch (GetParam()) {
+      case SeqnoTimeTestType::kTrackInternalTimeSeconds:
+        options.preclude_last_level_data_seconds = 0;
+        options.preserve_internal_time_seconds = track_time_duration;
+        break;
+      case SeqnoTimeTestType::kPrecludeLastLevel:
+        options.preclude_last_level_data_seconds = track_time_duration;
+        options.preserve_internal_time_seconds = 0;
+        break;
+      case SeqnoTimeTestType::kBothSetTrackSmaller:
+        options.preclude_last_level_data_seconds = track_time_duration;
+        options.preserve_internal_time_seconds = track_time_duration / 10;
+        break;
+    }
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
+    ::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
+                      SeqnoTimeTestType::kPrecludeLastLevel,
+                      SeqnoTimeTestType::kBothSetTrackSmaller));
+
+TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+
+  options.env = mock_env_.get();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  std::set<uint64_t> checked_file_nums;
+  SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber();
+  // Write a key every 10 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  ASSERT_OK(Flush());
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  auto it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  ASSERT_FALSE(tp_mapping.Empty());
+  auto seqs = tp_mapping.TEST_GetInternalMapping();
+  // about ~20 seqno->time entries, because the sampling rate is 10000/100,
+  // and about 2k seconds have passed.
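+  // Arithmetic check: 200 keys * 10s each = 2000s elapsed; sampled every
+  // 10000/100 = 100s, that yields roughly 2000/100 = 20 entries, hence the
+  // [19, 21] bounds below.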
+  ASSERT_GE(seqs.size(), 19);
+  ASSERT_LE(seqs.size(), 21);
+  SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber();
+  for (auto i = start_seq; i < start_seq + 10; i++) {
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10);
+  }
+  start_seq += 10;
+  for (auto i = start_seq; i < seq_end; i++) {
+    // The result is within the range
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every second
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 190), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 2);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // There are only a few time samples
+  ASSERT_GE(seqs.size(), 1);
+  ASSERT_LE(seqs.size(), 3);
+  for (auto i = start_seq; i < seq_end; i++) {
+    // The result is not very accurate, as more data was written within a
+    // small window of time
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every 200 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 380), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 3);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // The number of seqno->time entries should be at the maximum
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+  for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+  }
+  start_seq += 101;
+
+  for (auto i = start_seq; i < seq_end; i++) {
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 200 + 22200);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 200 + 22600);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every 100 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 570), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 4);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+
+  checked_file_nums.insert(it->second->orig_file_number);
+
+  // re-enable compaction
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_GE(tables_props.size(), 1);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+  for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 50000);
+  }
+  start_seq += 101;
+
+  for (auto i = start_seq; i < seq_end; i++) {
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 52200);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 52400);
+  }
+  ASSERT_OK(db_->Close());
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 0;
+  options.preserve_internal_time_seconds = 0;
+  options.env = mock_env_.get();
+  options.stats_dump_period_sec = 0;
+  options.stats_persist_period_sec = 0;
+  ReopenWithColumnFamilies({"default"}, options);
+
+  const PeriodicTaskScheduler& scheduler =
+      dbfull()->TEST_GetPeriodicTaskScheduler();
+  ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+  // Write some data and increase the current time
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_OK(Flush());
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  auto it = tables_props.begin();
+  ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
+
+  Options options_1 = options;
+  SetTrackTimeDurationOptions(10000, options_1);
+  CreateColumnFamilies({"one"}, options_1);
+  ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+  // Write some data to the default CF (without the preclude_last_level
+  // feature)
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_OK(Flush());
+
+  // Write some data to CF "one"
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(1, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  ASSERT_OK(Flush(1));
+  tables_props.clear();
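+  // Unlike the default CF's table checked above, CF "one" was created with
+  // time tracking enabled, so its SST is expected to carry a non-empty
+  // seqno_to_time_mapping table property.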
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  ASSERT_FALSE(tp_mapping.Empty());
+  auto seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 1);
+  ASSERT_LE(seqs.size(), 4);
+
+  // Create one more CF with a larger preclude_last_level time
+  Options options_2 = options;
+  SetTrackTimeDurationOptions(1000000, options_2);  // 1m
+  CreateColumnFamilies({"two"}, options_2);
+
+  // Add more data to CF "two" to fill the in-memory mapping
+  for (int i = 0; i < 2000; i++) {
+    ASSERT_OK(Put(2, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 1000 - 1);
+  ASSERT_LE(seqs.size(), 1000 + 1);
+
+  ASSERT_OK(Flush(2));
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // the maximum number of encoded entries is 100
+  ASSERT_GE(seqs.size(), 100 - 1);
+  ASSERT_LE(seqs.size(), 100 + 1);
+
+  // Write some data to the default CF; as all memtables with
+  // preclude_last_level enabled have been flushed, the in-memory seqno->time
+  // mapping should be cleared
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(0, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+  ASSERT_OK(Flush(0));
+
+  // trigger compaction for CF "two" and make sure the compaction output has
+  // seqno_to_time_mapping
+  for (int j = 0; j < 3; j++) {
+    for (int i = 0; i < 200; i++) {
+      ASSERT_OK(Put(2, Key(i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+    }
+    ASSERT_OK(Flush(2));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+
+  for (int j = 0; j < 2; j++) {
+    for (int i = 0; i < 200; i++) {
+      ASSERT_OK(Put(0, Key(i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+    }
+    ASSERT_OK(Flush(0));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  // Write some data to CF "two", but don't flush, to accumulate entries
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(Put(2, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_GE(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      500);
+  // After dropping CF "one", the in-memory mapping will change to follow
+  // only CF "two"'s options.
+  ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+  ASSERT_LE(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      100 + 5);
+
+  // After dropping CF "two", the in-memory mapping is also cleared.
+  ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+  ASSERT_EQ(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      0);
+
+  // And the timer worker is stopped
+  ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+  Close();
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
+  const int kInstanceNum = 2;
+
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+  options.env = mock_env_.get();
+  options.stats_dump_period_sec = 0;
+  options.stats_persist_period_sec = 0;
+
+  auto dbs = std::vector<DB*>(kInstanceNum);
+  for (int i = 0; i < kInstanceNum; i++) {
+    ASSERT_OK(
+        DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+  }
+
+  // Make sure the second instance has the worker enabled
+  auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+  WriteOptions wo;
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(dbi->Put(wo, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
+  ASSERT_GT(seqno_to_time_mapping.Size(), 10);
+
+  for (int i = 0; i < kInstanceNum; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    delete dbs[i];
+  }
+}
+
+TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = kNumLevels;
+  options.env = mock_env_.get();
+
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t num_seqno_zeroing{0};
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq",
+      [&](void* /*arg*/) { num_seqno_zeroing++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int sst_num = 0;
+  for (; sst_num < kNumTrigger - 1; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 3);
+  for (const auto& props : tables_props) {
+    ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
+    SeqnoToTimeMapping tp_mapping;
+    ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
+    ASSERT_OK(tp_mapping.Sort());
+    ASSERT_FALSE(tp_mapping.Empty());
+    auto seqs = tp_mapping.TEST_GetInternalMapping();
+    ASSERT_GE(seqs.size(), 10 - 1);
+    ASSERT_LE(seqs.size(), 10 + 1);
+  }
+
+  // Trigger a compaction
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  sst_num++;
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+
+  auto it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+  // compact to the last level
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // make sure the data is all compacted to the penultimate level if the
+  // feature is on; otherwise, it is compacted to the last level.
+  if (options.preclude_last_level_data_seconds > 0) {
+    ASSERT_GT(NumTableFilesAtLevel(5), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(6), 0);
+  } else {
+    ASSERT_EQ(NumTableFilesAtLevel(5), 0);
+    ASSERT_GT(NumTableFilesAtLevel(6), 0);
+  }
+
+  // regardless of whether the file is on the last level, it should keep the
+  // time information, and the sequence numbers should not be zeroed out yet
+  tables_props.clear();
+  tp_mapping.Clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+  ASSERT_EQ(tables_props.size(), 1);
+  ASSERT_EQ(num_seqno_zeroing, 0);
+
+  it = tables_props.begin();
+  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+  // let half of the data expire
+  mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  tables_props.clear();
+  tp_mapping.Clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+  if (options.preclude_last_level_data_seconds > 0) {
+    ASSERT_EQ(tables_props.size(), 2);
+  } else {
+    ASSERT_EQ(tables_props.size(), 1);
+  }
+  ASSERT_GT(num_seqno_zeroing, 0);
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  // make sure there are more than 300 keys, that the first 100 keys have
+  // their seqnos zeroed out, and that the last 100 keys do not
+  ASSERT_GT(key_versions.size(), 300);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_EQ(key_versions[i].sequence, 0);
+  }
+  auto rit = key_versions.rbegin();
+  for (int i = 0; i < 100; i++) {
+    ASSERT_GT(rit->sequence, 0);
+    rit++;
+  }
+
+  // let all data expire and compact again to push it to the last level,
+  // regardless of whether the tiering feature is enabled
+  mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_GT(num_seqno_zeroing, 0);
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+  Close();
+}
+
+TEST_F(SeqnoTimeTest, MappingAppend) {
+  SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+  // ignore seqno == 0, as it may mean the seqno is zeroed out
+  ASSERT_FALSE(test.Append(0, 9));
+
+  ASSERT_TRUE(test.Append(3, 10));
+  auto size = test.Size();
+  // normal add
+  ASSERT_TRUE(test.Append(10, 11));
+  size++;
+  ASSERT_EQ(size, test.Size());
+
+  // Append unsorted
+  ASSERT_FALSE(test.Append(8, 12));
+  ASSERT_EQ(size, test.Size());
+
+  // Append with the same seqno, the newer time will be accepted
+  ASSERT_TRUE(test.Append(10, 12));
+  ASSERT_EQ(size, test.Size());
+  // an older time will be ignored
+  ASSERT_FALSE(test.Append(10, 9));
+  ASSERT_EQ(size, test.Size());
+
+  // a new seqno with an old time will be ignored
+  ASSERT_FALSE(test.Append(12, 8));
+  ASSERT_EQ(size, test.Size());
+}
+
+TEST_F(SeqnoTimeTest, GetOldestApproximateTime) {
+  SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+  ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime);
+
+  test.Append(3, 10);
+
+  ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime);
+  ASSERT_EQ(test.GetOldestApproximateTime(3), 10);
+  ASSERT_EQ(test.GetOldestApproximateTime(10), 10);
+
+  test.Append(10, 100);
+
+  test.Append(100, 1000);
+  ASSERT_EQ(test.GetOldestApproximateTime(10), 100);
+  ASSERT_EQ(test.GetOldestApproximateTime(40), 100);
+  ASSERT_EQ(test.GetOldestApproximateTime(111), 1000);
+}
+
+TEST_F(SeqnoTimeTest, Sort) {
+  SeqnoToTimeMapping test;
+
+  // single entry
+  test.Add(10, 11);
+  ASSERT_OK(test.Sort());
+  ASSERT_EQ(test.Size(), 1);
+
+  // duplicate, should be removed by sort
+  test.Add(10, 11);
+  // same seqno, but older time, should be removed
+  test.Add(10, 9);
+
+  // useless ones, should be removed by sort
+  test.Add(11, 9);
+  test.Add(9, 8);
+
+  // Good ones
+  test.Add(1, 10);
+  test.Add(100, 100);
+
+  ASSERT_OK(test.Sort());
+
+  auto seqs = test.TEST_GetInternalMapping();
+
+  std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+  expected.emplace_back(1, 10);
+  expected.emplace_back(10, 11);
+  expected.emplace_back(100, 100);
+
+  ASSERT_EQ(expected, seqs);
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
+  SeqnoToTimeMapping test(0, 1000);
+
+  std::string output;
+  test.Encode(output, 0, 1000, 100);
+  ASSERT_TRUE(output.empty());
+
+  for (int i = 1; i <= 1000; i++) {
+    ASSERT_TRUE(test.Append(i, i * 10));
+  }
+  test.Encode(output, 0, 1000, 100);
+
+  ASSERT_FALSE(output.empty());
+
+  SeqnoToTimeMapping decoded;
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+  ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
+  ASSERT_EQ(test.Size(), 1000);
+
+  for (SequenceNumber seq = 0; seq <= 1000; seq++) {
+    // `test` has the more accurate time mapping; Encode() only picks
+    // kMaxSeqnoTimePairsPerSST entries, which is less accurate
+    uint64_t target_time = test.GetOldestApproximateTime(seq);
+    ASSERT_GE(decoded.GetOldestApproximateTime(seq),
+              target_time < 200 ? 0 : target_time - 200);
+    ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time);
+  }
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
+  SeqnoToTimeMapping test(0, 10);
+
+  test.Append(1, 10);
+  test.Append(5, 17);
+  test.Append(6, 25);
+  test.Append(8, 30);
+
+  std::string output;
+  test.Encode(output, 1, 10, 0, 3);
+
+  SeqnoToTimeMapping decoded;
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+
+  ASSERT_EQ(decoded.Size(), 3);
+
+  auto seqs = decoded.TEST_GetInternalMapping();
+  std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+  expected.emplace_back(1, 10);
+  expected.emplace_back(6, 25);
+  expected.emplace_back(8, 30);
+  ASSERT_EQ(expected, seqs);
+
+  // Add a few entries with large times
+  test.Append(10, 100);
+  test.Append(13, 200);
+  test.Append(16, 300);
+
+  output.clear();
+  test.Encode(output, 1, 20, 0, 4);
+  decoded.Clear();
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+  ASSERT_EQ(decoded.Size(), 4);
+
+  expected.clear();
+  expected.emplace_back(1, 10);
+  // entries #6 and #8 are skipped as they are too close to #1. Entry (10, 100)
+  // is also within the skip range, but if it were skipped there would not be
+  // enough entries left to fill 4 slots, so it is selected.
+  expected.emplace_back(10, 100);
+  expected.emplace_back(13, 200);
+  expected.emplace_back(16, 300);
+  seqs = decoded.TEST_GetInternalMapping();
+  ASSERT_EQ(expected, seqs);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/seqno_to_time_mapping.cc b/src/rocksdb/db/seqno_to_time_mapping.cc
new file mode 100644
index 000000000..c69209929
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.cc
@@ -0,0 +1,341 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/seqno_to_time_mapping.h"
+
+#include "db/version_edit.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t SeqnoToTimeMapping::GetOldestApproximateTime(
+    const SequenceNumber seqno) const {
+  assert(is_sorted_);
+  auto it = std::upper_bound(seqno_time_mapping_.begin(),
+                             seqno_time_mapping_.end(), seqno);
+  if (it == seqno_time_mapping_.begin()) {
+    return 0;
+  }
+  it--;
+  return it->time;
+}
+
+void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) {
+  if (seqno == 0) {
+    return;
+  }
+  is_sorted_ = false;
+  seqno_time_mapping_.emplace_back(seqno, time);
+}
+
+void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
+  assert(is_sorted_);
+
+  if (max_time_duration_ == 0) {
+    return;
+  }
+
+  const uint64_t cut_off_time =
+      now > max_time_duration_ ? now - max_time_duration_ : 0;
+  assert(cut_off_time <= now);  // no overflow
+
+  auto it = std::upper_bound(
+      seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time,
+      [](uint64_t target, const SeqnoTimePair& other) -> bool {
+        return target < other.time;
+      });
+  if (it == seqno_time_mapping_.begin()) {
+    return;
+  }
+  it--;
+  seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it);
+}
+
+SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) {
+  assert(is_sorted_);
+
+  auto it = std::upper_bound(
+      seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time,
+      [](uint64_t target, const SeqnoTimePair& other) -> bool {
+        return target < other.time;
+      });
+  if (it == seqno_time_mapping_.begin()) {
+    return 0;
+  }
+  it--;
+  return it->seqno;
+}
+
+// The encoded format is:
+//  [num_of_entries][[seqno][time],[seqno][time],...]
+//       ^                        ^
+//    var_int          delta_encoded (var_int)
+void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start,
+                                const SequenceNumber end, const uint64_t now,
+                                const uint64_t output_size) const {
+  assert(is_sorted_);
+  if (start > end) {
+    // This can happen when the SST file is empty: the initial value of the
+    // min sequence number is kMaxSequenceNumber and the max is 0.
+    // The empty output file will be removed in the final step of compaction.
+    return;
+  }
+
+  auto start_it = std::upper_bound(seqno_time_mapping_.begin(),
+                                   seqno_time_mapping_.end(), start);
+  if (start_it != seqno_time_mapping_.begin()) {
+    start_it--;
+  }
+
+  auto end_it = std::upper_bound(seqno_time_mapping_.begin(),
+                                 seqno_time_mapping_.end(), end);
+  if (end_it == seqno_time_mapping_.begin()) {
+    return;
+  }
+  if (start_it >= end_it) {
+    return;
+  }
+
+  // truncate old entries that are not needed
+  if (max_time_duration_ > 0) {
+    const uint64_t cut_off_time =
+        now > max_time_duration_ ? now - max_time_duration_ : 0;
+    while (start_it < end_it && start_it->time < cut_off_time) {
+      start_it++;
+    }
+  }
+  // to include the first element
+  if (start_it != seqno_time_mapping_.begin()) {
+    start_it--;
+  }
+
+  // If there is more data than needed, pick the entries for encoding.
+  // It's not the most optimized algorithm for selecting the best
+  // representative entries over time.
+  // It starts from the beginning and makes sure the distance is larger than
+  // `(end - start) / size` before selecting the next entry. For example, for
+  // the following list, pick 3 entries (it will pick seqno #1, #6, #8):
+  //  1 -> 10
+  //  5 -> 17
+  //  6 -> 25
+  //  8 -> 30
+  // First, it always picks the first one; then there are 2 entries left to
+  // fill (num_entries_to_fill), and the time difference between the current
+  // entry and the last one is (30 - 10) = 20. 20 / 2 = 10, so it will skip
+  // until time 10 + 10 = 20 => it skips #5 and picks #6.
+  // The optimal solution would pick #1, #5, #8, as that is more evenly
+  // distributed in time. Still, the following algorithm is simple and may
+  // over-select new data, which is fine: we do want more accurate time
+  // information for recent data.
+  std::deque<SeqnoTimePair> output_copy;
+  if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) {
+    int64_t num_entries_to_fill = static_cast<int64_t>(output_size);
+    auto last_it = end_it;
+    last_it--;
+    uint64_t end_time = last_it->time;
+    uint64_t skip_until_time = 0;
+    for (auto it = start_it; it < end_it; it++) {
+      // skip if it has not reached skip_until_time yet
+      if (std::distance(it, end_it) > num_entries_to_fill &&
+          it->time < skip_until_time) {
+        continue;
+      }
+      output_copy.push_back(*it);
+      num_entries_to_fill--;
+      if (std::distance(it, end_it) > num_entries_to_fill &&
+          num_entries_to_fill > 0) {
+        // If there are more entries than we need, recalculate
+        // skip_until_time, i.e. skip entries until that time
+        skip_until_time =
+            it->time + ((end_time - it->time) / num_entries_to_fill);
+      }
+    }
+
+    // Make sure all entries are filled
+    assert(num_entries_to_fill == 0);
+    start_it = output_copy.begin();
+    end_it = output_copy.end();
+  }
+
+  // Delta encode the data
+  uint64_t size = std::distance(start_it, end_it);
+  PutVarint64(&dest, size);
+  SeqnoTimePair base;
+  for (auto it = start_it; it < end_it; it++) {
+    assert(base < *it);
+    SeqnoTimePair val = *it - base;
+    base = *it;
+    val.Encode(dest);
+  }
+}
+
+Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) {
+  Slice input(seqno_time_mapping_str);
+  if (input.empty()) {
+    return Status::OK();
+  }
+  uint64_t size;
+  if (!GetVarint64(&input, &size)) {
+    return Status::Corruption("Invalid sequence number time size");
+  }
+  is_sorted_ = false;
+  SeqnoTimePair base;
+  for (uint64_t i = 0; i < size; i++) {
+    SeqnoTimePair val;
+    Status s = val.Decode(input);
+    if (!s.ok()) {
+      return s;
+    }
+    val.Add(base);
+    seqno_time_mapping_.emplace_back(val);
+    base = val;
+  }
+  return Status::OK();
+}
+
+void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
+  PutVarint64Varint64(&dest, seqno, time);
+}
+
+Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
+  if (!GetVarint64(&input, &seqno)) {
+    return Status::Corruption("Invalid sequence number");
+  }
+  if (!GetVarint64(&input, &time)) {
+    return Status::Corruption("Invalid time");
+  }
+  return Status::OK();
+}
+
+bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
+  assert(is_sorted_);
+
+  // skip seqno 0, which may have special meaning, like zeroed out data
+  if (seqno == 0) {
+    return false;
+  }
+  if (!Empty()) {
+    if (seqno < Last().seqno || time < Last().time) {
+      return false;
+    }
+    if (seqno == Last().seqno) {
+      Last().time = time;
+      return true;
+    }
+    if (time == Last().time) {
+      // the new sequence has the same time as the old one, no need to add a
+      // new mapping
+      return false;
+    }
+  }
+
+  seqno_time_mapping_.emplace_back(seqno, time);
+
+  if (seqno_time_mapping_.size() > max_capacity_) {
+    seqno_time_mapping_.pop_front();
+  }
+  return true;
+}
+
+bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
+                                uint64_t max_time_duration) {
+  uint64_t new_max_capacity =
+      CalculateMaxCapacity(min_time_duration, max_time_duration);
+  if (new_max_capacity == max_capacity_) {
+    return false;
+  } else if (new_max_capacity < seqno_time_mapping_.size()) {
+    uint64_t delta = seqno_time_mapping_.size() - new_max_capacity;
+    seqno_time_mapping_.erase(seqno_time_mapping_.begin(),
+                              seqno_time_mapping_.begin() + delta);
+  }
+  max_capacity_ = new_max_capacity;
+  return true;
+}
+
+Status SeqnoToTimeMapping::Sort() {
+  if (is_sorted_) {
+    return Status::OK();
+  }
+  if (seqno_time_mapping_.empty()) {
+    is_sorted_ = true;
+    return Status::OK();
+  }
+
+  std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_);
+
+  std::sort(copy.begin(), copy.end());
+
+  seqno_time_mapping_.clear();
+
+  // remove seqno = 0, which may have a special meaning, like zeroed-out data
+  while (copy.front().seqno == 0) {
+    copy.pop_front();
+  }
+
+  SeqnoTimePair prev = copy.front();
+  for (const auto& it : copy) {
+    // If the sequence number is the same, pick the one with the larger time,
+    // which is more accurate than the older time.
+    if (it.seqno == prev.seqno) {
+      assert(it.time >= prev.time);
+      prev.time = it.time;
+    } else {
+      assert(it.seqno > prev.seqno);
+      // A larger sequence number with an older time is not useful; skip it
+      if (it.time > prev.time) {
+        seqno_time_mapping_.push_back(prev);
+        prev = it;
+      }
+    }
+  }
+  seqno_time_mapping_.emplace_back(prev);
+
+  is_sorted_ = true;
+  return Status::OK();
+}
+
+std::string SeqnoToTimeMapping::ToHumanString() const {
+  std::string ret;
+  for (const auto& seq_time : seqno_time_mapping_) {
+    AppendNumberTo(&ret, seq_time.seqno);
+    ret.append("->");
+    AppendNumberTo(&ret, seq_time.time);
+    ret.append(",");
+  }
+  return ret;
+}
+
+SeqnoToTimeMapping SeqnoToTimeMapping::Copy(
+    SequenceNumber smallest_seqno) const {
+  SeqnoToTimeMapping ret;
+  auto it = std::upper_bound(seqno_time_mapping_.begin(),
+                             seqno_time_mapping_.end(), smallest_seqno);
+  if (it != seqno_time_mapping_.begin()) {
+    it--;
+  }
+  std::copy(it, seqno_time_mapping_.end(),
+            std::back_inserter(ret.seqno_time_mapping_));
+  return ret;
+}
+
+uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration,
+                                                  uint64_t max_time_duration) {
+  if (min_time_duration == 0) {
+    return 0;
+  }
+  return std::min(
+      kMaxSeqnoToTimeEntries,
+      max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration);
+}
+
+SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-(
+    const SeqnoTimePair& other) const {
+  SeqnoTimePair res;
+  res.seqno = seqno - other.seqno;
+  res.time = time - other.time;
+  return res;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/seqno_to_time_mapping.h b/src/rocksdb/db/seqno_to_time_mapping.h
new file mode 100644
index 000000000..4ffc9c199
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.h
@@ -0,0 +1,189 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <functional>
+#include <iterator>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kUnknownSeqnoTime = 0;
+
+// SeqnoToTimeMapping stores the sequence number to time mapping, so given a
+// sequence number it can estimate the oldest possible time for that sequence
+// number. For example:
+//   10 -> 100
+//   50 -> 300
+// If a key has seqno 19, its OldestApproximateTime would be 100; for seqno 51
+// it would be 300.
+// As the list is sorted, new entries are appended at the back. Old entries are
+// popped from the front once they are no longer needed.
+//
+// Note: the data structure is not thread safe; both reads and writes need to
+// be synchronized by the caller.
class SeqnoToTimeMapping {
+ public:
+  // Maximum number of entries that can be encoded into an SST. The data is
+  // delta encoded, so the maximum data usage for each SST is < 0.3KB.
+  static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
+
+  // Maximum number of entries per CF. If there's only one CF with this feature
+  // on, the max duration is divided by this number. For example, if
+  // preclude_last_level_data_seconds = 100000 (~1 day), then it will sample
+  // the seqno -> time mapping every 1000 seconds (~17 minutes), so the maximum
+  // number of entries it needs is 100.
+  // When multiple CFs have this feature on, the sampling cadence is determined
+  // by the smallest setting and the capacity is determined by the largest
+  // setting; the capacity is also capped at kMaxSeqnoTimePairsPerCF * 10.
+  static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
+
+  // A simple struct for a sequence number to time pair
+  struct SeqnoTimePair {
+    SequenceNumber seqno = 0;
+    uint64_t time = 0;
+
+    SeqnoTimePair() = default;
+    SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
+        : seqno(_seqno), time(_time) {}
+
+    // Encode to dest string
+    void Encode(std::string& dest) const;
+
+    // Decode the value from input Slice and remove it from the input
+    Status Decode(Slice& input);
+
+    // Subtraction of two SeqnoTimePairs
+    SeqnoTimePair operator-(const SeqnoTimePair& other) const;
+
+    // Add two values together
+    void Add(const SeqnoTimePair& obj) {
+      seqno += obj.seqno;
+      time += obj.time;
+    }
+
+    // Compare a SeqnoTimePair with a sequence number, used for binary
+    // searching a sequence number in a list of SeqnoTimePairs
+    bool operator<(const SequenceNumber& other) const { return seqno < other; }
+
+    // Compare two SeqnoTimePairs
+    bool operator<(const SeqnoTimePair& other) const {
+      return std::tie(seqno, time) < std::tie(other.seqno, other.time);
+    }
+
+    // Check if two SeqnoTimePairs are equal
+    bool operator==(const SeqnoTimePair& other) const {
+      return std::tie(seqno, time) == std::tie(other.seqno, other.time);
+    }
+  };
+
+  // Constructor of SeqnoToTimeMapping.
+  // max_time_duration is the maximum time the mapping should track. For
+  // example, if preclude_last_level_data_seconds is 1 day, then entries older
+  // than 1 day can be removed.
+  // max_capacity is the maximum number of entries it can hold. For a single
+  // CF, it's capped at 100 (kMaxSeqnoTimePairsPerCF), otherwise at
+  // kMaxSeqnoTimePairsPerCF * 10.
+  // If set to 0, it won't truncate any old data.
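+  //
+  // Illustrative usage (a sketch based on the class-level example above; the
+  // values here are assumed, not taken from real callers):
+  //   SeqnoToTimeMapping mapping(/*max_time_duration=*/0,
+  //                              /*max_capacity=*/100);
+  //   mapping.Append(10, 100);  // seqno 10 first seen at time 100
+  //   mapping.Append(50, 300);  // seqno 50 first seen at time 300
+  //   // Seqno 19 falls between the two entries, so the oldest time it could
+  //   // have been written at is the one recorded for seqno 10:
+  //   assert(mapping.GetOldestApproximateTime(19) == 100);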
+  explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0,
+                              uint64_t max_capacity = 0)
+      : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {}
+
+  // Append a new entry to the list. The new entry should be newer than the
+  // existing ones. It maintains the internal sorted status.
+  bool Append(SequenceNumber seqno, uint64_t time);
+
+  // Given a sequence number, estimate its oldest time
+  uint64_t GetOldestApproximateTime(SequenceNumber seqno) const;
+
+  // Truncate the old entries based on the current time and max_time_duration_
+  void TruncateOldEntries(uint64_t now);
+
+  // Given a time, return its oldest possible sequence number
+  SequenceNumber GetOldestSequenceNum(uint64_t time);
+
+  // Encode to a binary string
+  void Encode(std::string& dest, SequenceNumber start, SequenceNumber end,
+              uint64_t now,
+              uint64_t output_size = kMaxSeqnoTimePairsPerSST) const;
+
+  // Add a new entry. Unlike Append(), it can be arbitrary data, but it also
+  // leaves the list unsorted.
+  void Add(SequenceNumber seqno, uint64_t time);
+
+  // Decode and add the entries to the current object. The list will be
+  // unsorted afterwards.
+  Status Add(const std::string& seqno_time_mapping_str);
+
+  // Return the number of entries
+  size_t Size() const { return seqno_time_mapping_.size(); }
+
+  // Reduce the size of the internal list
+  bool Resize(uint64_t min_time_duration, uint64_t max_time_duration);
+
+  // Override the max_time_duration_
+  void SetMaxTimeDuration(uint64_t max_time_duration) {
+    max_time_duration_ = max_time_duration;
+  }
+
+  uint64_t GetCapacity() const { return max_capacity_; }
+
+  // Sort the list and remove redundant and useless entries, which makes sure
+  // both the seqnos and the times are sorted
+  Status Sort();
+
+  // Copy the current object, starting from the given smallest_seqno.
+  SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const;
+
+  // Whether the internal list is empty
+  bool Empty() const { return seqno_time_mapping_.empty(); }
+
+  // Clear all entries
+  void Clear() { seqno_time_mapping_.clear(); }
+
+  // Return a string for user messages.
+  // Note: not efficient, but okay for printing.
+  std::string ToHumanString() const;
+
+#ifndef NDEBUG
+  const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
+    return seqno_time_mapping_;
+  }
+#endif
+
+ private:
+  static constexpr uint64_t kMaxSeqnoToTimeEntries =
+      kMaxSeqnoTimePairsPerCF * 10;
+
+  uint64_t max_time_duration_;
+  uint64_t max_capacity_;
+
+  std::deque<SeqnoTimePair> seqno_time_mapping_;
+
+  bool is_sorted_ = true;
+
+  static uint64_t CalculateMaxCapacity(uint64_t min_time_duration,
+                                       uint64_t max_time_duration);
+
+  SeqnoTimePair& Last() {
+    assert(!Empty());
+    return seqno_time_mapping_.back();
+  }
+};
+
+// for binary searching a sequence number in a list of SeqnoTimePairs
+inline bool operator<(const SequenceNumber& seqno,
+                      const SeqnoToTimeMapping::SeqnoTimePair& other) {
+  return seqno < other.seqno;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..0bfb1aa07
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+  kInSnapshot = 0,
+  kNotInSnapshot = 1,
+  // In case the snapshot is released and the checker has no clue whether
+  // the given sequence is visible to the snapshot.
+  kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+  virtual ~SnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+  virtual ~DisableGCSnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber /*sequence*/,
+      SequenceNumber /*snapshot_sequence*/) const override {
+    // By returning kNotInSnapshot, we prevent all the values from being GCed
+    return SnapshotCheckerResult::kNotInSnapshot;
+  }
+  static DisableGCSnapshotChecker* Instance();
+
+ protected:
+  explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check if a key
+// is visible by a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+  explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+  virtual ~WritePreparedSnapshotChecker() {}
+
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+  const WritePreparedTxnDB* const txn_db_;
+#endif  // !ROCKSDB_LITE
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..98b475463
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db)
+    : db_(db), snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+    : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+  if (snapshot_) {
+    db_->ReleaseSnapshot(snapshot_);
+  }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..23e5e98cd
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,239 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/db.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+  // It indicates the smallest uncommitted data at the time the snapshot was
+  // taken. This is currently used by WritePrepared transactions to limit the
+  // scope of queries to IsInSnapshot.
+  SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+  SequenceNumber GetSequenceNumber() const override { return number_; }
+
+  int64_t GetUnixTime() const override { return unix_time_; }
+
+  uint64_t GetTimestamp() const override { return timestamp_; }
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;  // just for sanity checks
+
+  int64_t unix_time_;
+
+  uint64_t timestamp_;
+
+  // Will this snapshot be used by a Transaction to do write-conflict checking?
+  bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;  // placeholder marker, for debugging
+    // Set all the variables to make UBSAN happy.
+    list_.list_ = nullptr;
+    list_.unix_time_ = 0;
+    list_.timestamp_ = 0;
+    list_.is_write_conflict_boundary_ = false;
+    count_ = 0;
+  }
+
+  // No copy-construct.
+  SnapshotList(const SnapshotList&) = delete;
+
+  bool empty() const {
+    assert(list_.next_ != &list_ || 0 == count_);
+    return list_.next_ == &list_;
+  }
+  SnapshotImpl* oldest() const {
+    assert(!empty());
+    return list_.next_;
+  }
+  SnapshotImpl* newest() const {
+    assert(!empty());
+    return list_.prev_;
+  }
+
+  SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+                    bool is_write_conflict_boundary,
+                    uint64_t ts = std::numeric_limits<uint64_t>::max()) {
+    s->number_ = seq;
+    s->unix_time_ = unix_time;
+    s->timestamp_ = ts;
+    s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    count_++;
+    return s;
+  }
+
+  // Unlinks the snapshot from the list; it is not responsible for freeing
+  // the object.
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    count_--;
+  }
+
+  // Retrieve all snapshot numbers up until max_seq. They are sorted in
+  // ascending order (with no duplicates).
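+  // For example, with snapshots at seqnos {5, 8, 8, 12} and max_seq = 10, the
+  // result is {5, 8}: duplicates are collapsed and 12 is excluded. (These
+  // values are assumed here for illustration only.)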
+  std::vector<SequenceNumber> GetAll(
+      SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+      const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber> ret;
+    GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+    return ret;
+  }
+
+  void GetAll(std::vector<SequenceNumber>* snap_vector,
+              SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+              const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber>& ret = *snap_vector;
+    // So far we have no use case that would pass a non-empty vector
+    assert(ret.size() == 0);
+
+    if (oldest_write_conflict_snapshot != nullptr) {
+      *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+    }
+
+    if (empty()) {
+      return;
+    }
+    const SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      if (s->next_->number_ > max_seq) {
+        break;
+      }
+      // Avoid duplicates
+      if (ret.empty() || ret.back() != s->next_->number_) {
+        ret.push_back(s->next_->number_);
+      }
+
+      if (oldest_write_conflict_snapshot != nullptr &&
+          *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+          s->next_->is_write_conflict_boundary_) {
+        // If this is the first write-conflict boundary snapshot in the list,
+        // it is the oldest
+        *oldest_write_conflict_snapshot = s->next_->number_;
+      }
+
+      s = s->next_;
+    }
+    return;
+  }
+
+  // Get the sequence number of the most recent snapshot
+  SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  int64_t GetOldestSnapshotSequence() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->GetSequenceNumber();
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+  uint64_t count_;
+};
+
+// All operations on TimestampedSnapshotList must be protected by db mutex.
+class TimestampedSnapshotList {
+ public:
+  explicit TimestampedSnapshotList() = default;
+
+  std::shared_ptr<const SnapshotImpl> GetSnapshot(uint64_t ts) const {
+    if (ts == std::numeric_limits<uint64_t>::max() && !snapshots_.empty()) {
+      auto it = snapshots_.rbegin();
+      assert(it != snapshots_.rend());
+      return it->second;
+    }
+    auto it = snapshots_.find(ts);
+    if (it == snapshots_.end()) {
+      return std::shared_ptr<const SnapshotImpl>();
+    }
+    return it->second;
+  }
+
+  void GetSnapshots(
+      uint64_t ts_lb, uint64_t ts_ub,
+      std::vector<std::shared_ptr<const SnapshotImpl>>& snapshots) const {
+    assert(ts_lb < ts_ub);
+    auto it_low = snapshots_.lower_bound(ts_lb);
+    auto it_high = snapshots_.lower_bound(ts_ub);
+    for (auto it = it_low; it != it_high; ++it) {
+      snapshots.emplace_back(it->second);
+    }
+  }
+
+  void AddSnapshot(const std::shared_ptr<const SnapshotImpl>& snapshot) {
+    assert(snapshot);
+    snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot);
+  }
+
+  // snapshots_to_release: the container the timestamped snapshots are moved
+  // into. It retains the last reference to the snapshots so that they are not
+  // actually released here, since releasing them requires the db mutex. The
+  // snapshots will be released by the caller of ReleaseSnapshotsOlderThan().
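+  //
+  // Sketch of the intended call pattern (illustrative only; caller-side names
+  // are assumed):
+  //   autovector<std::shared_ptr<const SnapshotImpl>> to_release;
+  //   // with the db mutex held:
+  //   timestamped_snapshots.ReleaseSnapshotsOlderThan(ts, to_release);
+  //   // after dropping the db mutex, drop the collected last references:
+  //   to_release.clear();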
+  void ReleaseSnapshotsOlderThan(
+      uint64_t ts,
+      autovector<std::shared_ptr<const SnapshotImpl>>& snapshots_to_release) {
+    auto ub = snapshots_.lower_bound(ts);
+    for (auto it = snapshots_.begin(); it != ub; ++it) {
+      snapshots_to_release.emplace_back(it->second);
+    }
+    snapshots_.erase(snapshots_.begin(), ub);
+  }
+
+ private:
+  std::map<uint64_t, std::shared_ptr<const SnapshotImpl>> snapshots_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..c44c4bb84
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,753 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+  T* typed_value = reinterpret_cast<T*>(value);
+  delete typed_value;
+}
+}  // anonymous namespace
+}  // namespace ROCKSDB_NAMESPACE
+
+// Generate the regular and coroutine versions of some methods by
+// including table_cache_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+  return Slice(reinterpret_cast<const char*>(file_number),
+               sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+  char buf[10];
+  auto ptr = EncodeVarint64(buf, v);
+  key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // anonymous namespace
+
+const int kLoadConcurency = 128;
+
+TableCache::TableCache(const ImmutableOptions& ioptions,
+                       const FileOptions* file_options, Cache* const cache,
+                       BlockCacheTracer* const block_cache_tracer,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       const std::string& db_session_id)
+    : ioptions_(ioptions),
+      file_options_(*file_options),
+      cache_(cache),
+      immortal_tables_(false),
+      block_cache_tracer_(block_cache_tracer),
+      loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr),
+      io_tracer_(io_tracer),
+      db_session_id_(db_session_id) {
+  if (ioptions_.row_cache) {
+    // If the same cache is shared by multiple instances, we need to
+    // disambiguate its entries.
+    PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+  }
+}
+
+TableCache::~TableCache() {}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+  return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+  cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+    const ReadOptions& ro, const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, bool sequential_mode,
+    bool record_read_stats, HistogramImpl* file_read_hist,
+    std::unique_ptr<TableReader>* table_reader,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+  std::string fname = TableFileName(
+      ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
+  std::unique_ptr<FSRandomAccessFile> file;
+  FileOptions fopts = file_options;
+  fopts.temperature = file_temperature;
+  Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+  TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
+                           const_cast<Status*>(&s));
+  if (s.ok()) {
+    s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
+  }
+  if (s.ok()) {
+    RecordTick(ioptions_.stats, NO_FILE_OPENS);
+  } else if (s.IsPathNotFound()) {
+    fname = Rocks2LevelTableFileName(fname);
+    s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+    if (s.ok()) {
+      s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+                                            nullptr);
+    }
+    if (s.ok()) {
+      RecordTick(ioptions_.stats, NO_FILE_OPENS);
+    }
+  }
+
+  if (s.ok()) {
+    if (!sequential_mode && ioptions_.advise_random_on_open) {
+      file->Hint(FSRandomAccessFile::kRandom);
+    }
+    StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(
+            std::move(file), fname, ioptions_.clock, io_tracer_,
+            record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS,
+            file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners,
+            file_temperature, level == ioptions_.num_levels - 1));
+    UniqueId64x2 expected_unique_id;
+    if (ioptions_.verify_sst_unique_id_in_manifest) {
+      expected_unique_id = file_meta.unique_id;
+    } else {
+      expected_unique_id = kNullUniqueId64x2;  // null ID == no verification
+    }
+    s = ioptions_.table_factory->NewTableReader(
+        ro,
+        TableReaderOptions(ioptions_, prefix_extractor, file_options,
+                           internal_comparator, skip_filters, immortal_tables_,
+                           false /* force_direct_prefetch */, level,
+                           block_cache_tracer_, max_file_size_for_l0_meta_pin,
+                           db_session_id_, file_meta.fd.GetNumber(),
+                           expected_unique_id, file_meta.fd.largest_seqno),
+        std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
+        prefetch_index_and_filter_in_cache);
+    TEST_SYNC_POINT("TableCache::GetTableReader:0");
+  }
+  return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+  ReleaseHandle(handle);
+  uint64_t number = fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  cache_->Erase(key);
+}
+
+Status TableCache::FindTable(
+    const ReadOptions& ro, const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, Cache::Handle** handle,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
+    bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+  PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
+  uint64_t number = file_meta.fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  *handle = cache_->Lookup(key);
+  TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+                           const_cast<bool*>(&no_io));
+
+  if (*handle == nullptr) {
+    if (no_io) {
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    MutexLock load_lock(loader_mutex_.get(key));
+    // We check the cache again under the loading mutex
+    *handle = cache_->Lookup(key);
+    if (*handle != nullptr) {
+      return Status::OK();
+    }
+
+    std::unique_ptr<TableReader> table_reader;
+    Status s =
+        GetTableReader(ro, file_options, internal_comparator, file_meta,
+                       false /* sequential mode */, record_read_stats,
+                       file_read_hist, &table_reader, prefix_extractor,
+                       skip_filters, level, prefetch_index_and_filter_in_cache,
+                       max_file_size_for_l0_meta_pin, file_temperature);
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(ioptions_.stats, NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+                         handle);
+      if (s.ok()) {
+        // Release ownership of table reader.
+        table_reader.release();
+      }
+    }
+    return s;
+  }
+  return Status::OK();
+}
+
+InternalIterator* TableCache::NewIterator(
+    const ReadOptions& options, const FileOptions& file_options,
+    const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+    RangeDelAggregator* range_del_agg,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+    TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+    size_t max_file_size_for_l0_meta_pin,
+    const InternalKey* smallest_compaction_key,
+    const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+    TruncatedRangeDelIterator** range_del_iter) {
+  PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+  Status s;
+  TableReader* table_reader = nullptr;
+  Cache::Handle* handle = nullptr;
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+  bool for_compaction = caller == TableReaderCaller::kCompaction;
+  auto& fd = file_meta.fd;
+  table_reader = fd.table_reader;
+  if (table_reader == nullptr) {
+    s = FindTable(
+        options, file_options, icomparator, file_meta, &handle,
+        prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+        !for_compaction /* record_read_stats */, file_read_hist, skip_filters,
+        level, true /* prefetch_index_and_filter_in_cache */,
+        max_file_size_for_l0_meta_pin, file_meta.temperature);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(handle);
+    }
+  }
+  InternalIterator* result = nullptr;
+  if (s.ok()) {
+    if (options.table_filter &&
+        !options.table_filter(*table_reader->GetTableProperties())) {
+      result = NewEmptyInternalIterator<Slice>(arena);
+    } else {
+      result = table_reader->NewIterator(
+          options, prefix_extractor.get(), arena, skip_filters, caller,
+          file_options.compaction_readahead_size, allow_unprepared_value);
+    }
+    if (handle != nullptr) {
+      result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent from releasing below
+    }
+
+    if (for_compaction) {
+      table_reader->SetupForCompaction();
+    }
+    if (table_reader_ptr != nullptr) {
+      *table_reader_ptr = table_reader;
+    }
+  }
+  if (s.ok() && !options.ignore_range_deletions) {
+    if (range_del_iter != nullptr) {
+      auto new_range_del_iter =
+          table_reader->NewRangeTombstoneIterator(options);
+      if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
+        delete new_range_del_iter;
+        *range_del_iter = nullptr;
+      } else {
+        *range_del_iter = new TruncatedRangeDelIterator(
+            std::unique_ptr<FragmentedRangeTombstoneIterator>(
+                new_range_del_iter),
+            &icomparator, &file_meta.smallest, &file_meta.largest);
+      }
+    }
+    if (range_del_agg != nullptr) {
+      if (range_del_agg->AddFile(fd.GetNumber())) {
+        std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
+            static_cast<FragmentedRangeTombstoneIterator*>(
+                table_reader->NewRangeTombstoneIterator(options)));
+        if (new_range_del_iter != nullptr) {
+          s = new_range_del_iter->status();
+        }
+        if (s.ok()) {
+          const InternalKey* smallest = &file_meta.smallest;
+          const InternalKey* largest = &file_meta.largest;
+          if (smallest_compaction_key != nullptr) {
+            smallest = smallest_compaction_key;
+          }
+          if (largest_compaction_key != nullptr) {
+            largest = largest_compaction_key;
+          }
+          range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
+                                       largest);
+        }
+      }
+    }
+  }
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  if (!s.ok()) {
+    assert(result == nullptr);
+    result = NewErrorInternalIterator<Slice>(s, arena);
+  }
+  return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+  assert(out_iter);
+  const FileDescriptor& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (t == nullptr) {
+    s = FindTable(options, file_options_, internal_comparator, file_meta,
+                  &handle);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok()) {
+    // Note: NewRangeTombstoneIterator could return nullptr
+    out_iter->reset(t->NewRangeTombstoneIterator(options));
+  }
+  if (handle) {
+    if (*out_iter) {
+      (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle);
+    } else {
+      ReleaseHandle(handle);
+    }
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+                                         const FileDescriptor& fd,
+                                         const Slice& internal_key,
+                                         GetContext* get_context,
+                                         IterKey& row_cache_key) {
+  uint64_t fd_number = fd.GetNumber();
+  // We use the user key as the cache key instead of the internal key,
+  // otherwise the whole cache would be invalidated every time the
+  // sequence key increases. However, to support caching snapshot
+  // reads, we append the sequence number (incremented by 1 to
+  // distinguish from 0) only in this case.
+  // If the snapshot is larger than the largest seqno in the file,
+  // all data should be exposed to the snapshot, so we treat it
+  // the same as there being no snapshot. The exception is that if
+  // a seq-checking callback is registered, some internal keys
+  // may still be filtered out.
+  uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+  if (options.snapshot != nullptr &&
+      (get_context->has_callback() ||
+       static_cast_with_check<const SnapshotImpl>(options.snapshot)
+               ->GetSequenceNumber() <= fd.largest_seqno)) {
+    // We should consider using options.snapshot->GetSequenceNumber()
+    // instead of GetInternalKeySeqno(k), which would make the code
+    // easier to understand.
+    seq_no = 1 + GetInternalKeySeqno(internal_key);
+  }
+
+  // Compute row cache key.
+  row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                           row_cache_id_.size());
+  AppendVarint64(&row_cache_key, fd_number);
+  AppendVarint64(&row_cache_key, seq_no);
+}
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                                 size_t prefix_size, GetContext* get_context) {
+  bool found = false;
+
+  row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+  if (auto row_handle =
+          ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+    // Cleanable routine to release the cache entry
+    Cleanable value_pinner;
+    auto release_cache_entry_func = [](void* cache_to_clean,
+                                       void* cache_handle) {
+      ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+    };
+    auto found_row_cache_entry = static_cast<const std::string*>(
+        ioptions_.row_cache->Value(row_handle));
+    // If we get here, the value is located in the cache.
+    // found_row_cache_entry points to the value in the cache,
+    // and value_pinner has the cleanup procedure for the cached entry.
+    // After replayGetContextLog() returns, get_context.pinnable_slice_
+    // will point to the cache entry buffer (or a copy based on it) and
+    // the cleanup routine under value_pinner will be delegated to
+    // get_context.pinnable_slice_. The cache entry is released when
+    // get_context.pinnable_slice_ is reset.
+    value_pinner.RegisterCleanup(release_cache_entry_func,
+                                 ioptions_.row_cache.get(), row_handle);
+    replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+                        &value_pinner);
+    RecordTick(ioptions_.stats, ROW_CACHE_HIT);
+    found = true;
+  } else {
+    RecordTick(ioptions_.stats, ROW_CACHE_MISS);
+  }
+  return found;
+}
+#endif  // ROCKSDB_LITE
+
+Status TableCache::Get(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    HistogramImpl* file_read_hist, bool skip_filters, int level,
+    size_t max_file_size_for_l0_meta_pin) {
+  auto& fd = file_meta.fd;
+  std::string* row_cache_entry = nullptr;
+  bool done = false;
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  // Check the row cache if enabled. Since the row cache does not currently
+  // store sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+    auto user_key = ExtractUserKey(k);
+    CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+    done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+                           get_context);
+    if (!done) {
+      row_cache_entry = &row_cache_entry_buffer;
+    }
+  }
+#endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (!done) {
+    assert(s.ok());
+    if (t == nullptr) {
+      s = FindTable(options, file_options_, internal_comparator, file_meta,
+                    &handle, prefix_extractor,
+                    options.read_tier == kBlockCacheTier /* no_io */,
+                    true /* record_read_stats */, file_read_hist, skip_filters,
+                    level, true /* prefetch_index_and_filter_in_cache */,
+                    max_file_size_for_l0_meta_pin, file_meta.temperature);
+      if (s.ok()) {
+        t = GetTableReaderFromHandle(handle);
+      }
+    }
+    SequenceNumber* max_covering_tombstone_seq =
+        get_context->max_covering_tombstone_seq();
+    if (s.ok() && max_covering_tombstone_seq != nullptr &&
+        !options.ignore_range_deletions) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          t->NewRangeTombstoneIterator(options));
+      if (range_del_iter != nullptr) {
+        SequenceNumber seq =
+            range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
+        if (seq > *max_covering_tombstone_seq) {
+          *max_covering_tombstone_seq = seq;
+          if (get_context->NeedTimestamp()) {
+            get_context->SetTimestampFromRangeTombstone(
+                range_del_iter->timestamp());
+          }
+        }
+      }
+    }
+    if (s.ok()) {
+      get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
+      s = t->Get(options, k, get_context, prefix_extractor.get(),
+                 skip_filters);
+      get_context->SetReplayLog(nullptr);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      // Couldn't find the table in the cache, but treat it as kFound if
+      // no_io is set
+      get_context->MarkKeyMayExist();
+      s = Status::OK();
+      done = true;
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  // Put the replay log in the row cache only if something was found.
+  if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+    size_t charge = row_cache_entry->capacity() + sizeof(std::string);
+    void* row_ptr = new std::string(std::move(*row_cache_entry));
+    // If the row cache is full, it's OK to continue.
+    ioptions_.row_cache
+        ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+                 &DeleteEntry<std::string>)
+        .PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
+
+void TableCache::UpdateRangeTombstoneSeqnums(
+    const ReadOptions& options, TableReader* t,
+    MultiGetContext::Range& table_range) {
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      t->NewRangeTombstoneIterator(options));
+  if (range_del_iter != nullptr) {
+    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+      SequenceNumber* max_covering_tombstone_seq =
+          iter->get_context->max_covering_tombstone_seq();
+      SequenceNumber seq =
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
+      if (seq > *max_covering_tombstone_seq) {
+        *max_covering_tombstone_seq = seq;
+        if (iter->get_context->NeedTimestamp()) {
+          iter->get_context->SetTimestampFromRangeTombstone(
+              range_del_iter->timestamp());
+        }
+      }
+    }
+  }
+}
+
+Status TableCache::MultiGetFilter(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    HistogramImpl* file_read_hist, int level,
+    MultiGetContext::Range* mget_range, Cache::Handle** table_handle) {
+  auto& fd = file_meta.fd;
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  // Check if we need to use the row cache. If yes, then we cannot do the
+  // filtering here, since the filtering needs to happen after the row cache
+  // lookup.
+  KeyContext& first_key = *mget_range->begin();
+  if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
+    return Status::NotSupported();
+  }
+#endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+                                         mget_range->end());
+  if (t == nullptr) {
+    s = FindTable(
+        options, file_options_, internal_comparator, file_meta, &handle,
+        prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+        true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
+        level, true /* prefetch_index_and_filter_in_cache */,
+        /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+    *table_handle = handle;
+  }
+  if (s.ok()) {
+    s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
+  }
+  if (s.ok() && !options.ignore_range_deletions) {
+    // Update the range tombstone sequence numbers for the keys here
+    // as TableCache::MultiGet may or may not be called, and even if it
+    // is, it may be called with fewer keys in the range due to filtering.
+    UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+  }
+  if (mget_range->empty() && handle) {
+    ReleaseHandle(handle);
+    *table_handle = nullptr;
+  }
+
+  return s;
+}
+
+Status TableCache::GetTableProperties(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::shared_ptr<const TableProperties>* properties,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    bool no_io) {
+  auto table_reader = file_meta.fd.table_reader;
+  // Table already pre-loaded?
+  if (table_reader) {
+    *properties = table_reader->GetTableProperties();
+
+    return Status::OK();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+                       file_meta, &table_handle, prefix_extractor, no_io);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  *properties = table->GetTableProperties();
+  ReleaseHandle(table_handle);
+  return s;
+}
+
+Status TableCache::ApproximateKeyAnchors(
+    const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
+  Status s;
+  TableReader* t = file_meta.fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (t == nullptr) {
+    s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok() && t != nullptr) {
+    s = t->ApproximateKeyAnchors(ro, anchors);
+  }
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  auto table_reader = file_meta.fd.table_reader;
+  // Table already pre-loaded?
+  if (table_reader) {
+    return table_reader->ApproximateMemoryUsage();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+                       file_meta, &table_handle, prefix_extractor, true);
+  if (!s.ok()) {
+    return 0;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  auto ret = table->ApproximateMemoryUsage();
+  ReleaseHandle(table_handle);
+  return ret;
+}
+
+bool TableCache::HasEntry(Cache* cache, uint64_t file_number) {
+  Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number));
+  if (handle) {
+    cache->Release(handle);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+  cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+    const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+    const InternalKeyComparator& internal_comparator,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = file_meta.fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s =
+        FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+                  &table_handle, prefix_extractor, false /* no_io */,
+                  !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateOffsetOf(key, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+    const Slice& start, const Slice& end, const FileMetaData& file_meta,
+    TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = file_meta.fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s =
+        FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+                  &table_handle, prefix_extractor, false /* no_io */,
+                  !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateSize(start, end, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..2e50f2c77
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,275 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see NewIterator() comments
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache
+class TableCache {
+ public:
+  TableCache(const ImmutableOptions& ioptions,
+             const FileOptions* storage_options, Cache* cache,
+             BlockCacheTracer* const block_cache_tracer,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const std::string& db_session_id);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+  // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator. The returned "*table_reader_ptr" object is owned
+  // by the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
+  // then range_del_iter is set to a TruncatedRangeDelIterator for range
+  // tombstones in the SST file corresponding to the specified file number.
+  // The upper/lower bounds for the TruncatedRangeDelIterator are set to the
+  // SST file's boundary.
+  // @param options Must outlive the returned iterator.
+  // @param range_del_agg If non-nullptr, adds range deletions to the
+  //    aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+  // @param for_compaction If true, a new TableReader may be allocated (but
+  //    not cached), depending on the CF options
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  InternalIterator* NewIterator(
+      const ReadOptions& options, const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+      TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+      size_t max_file_size_for_l0_meta_pin,
+      const InternalKey* smallest_compaction_key,
+      const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+      TruncatedRangeDelIterator** range_del_iter = nullptr);
+
+  // If a seek to internal key "k" in the specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param get_context Context for the get operation. The result of the
+  //    lookup can be retrieved by calling get_context->State()
+  // @param file_read_hist If non-nullptr, the file reader statistics are
+  //    recorded
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  Status Get(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
+
+  // Return the range delete tombstone iterator of the file specified by
+  // `file_meta`.
+  Status GetRangeTombstoneIterator(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+  // Call the table reader's MultiGetFilter to use the bloom filter to filter
+  // out keys. Returns Status::NotSupported() if the row cache needs to be
+  // checked. If the table cache is looked up to get the table reader, the
+  // cache handle is returned in table_handle. This handle should be passed
+  // back to MultiGet() so it can be released.
+  Status MultiGetFilter(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      HistogramImpl* file_read_hist, int level,
+      MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
+
+  // If a seek to internal key "k" in the specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param mget_range Pointer to the structure describing a batch of keys to
+  //    be looked up in this table file.
+  //    The result is stored in the embedded GetContext
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  DECLARE_SYNC_AND_ASYNC(
+      Status, MultiGet, const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      bool skip_range_deletions = false, int level = -1,
+      Cache::Handle* table_handle = nullptr);
+
+  // Evict any entry for the specified file number
+  static void Evict(Cache* cache, uint64_t file_number);
+
+  // Query whether the specified file number is currently in the cache
+  static bool HasEntry(Cache* cache, uint64_t file_number);
+
+  // Clean the table handle and erase it from the table cache.
+  // Used in DB close, or when the file is not live anymore.
+  void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+  // Find the table reader
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level == -1 means not specified
+  Status FindTable(
+      const ReadOptions& ro, const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, Cache::Handle**,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      const bool no_io = false, bool record_read_stats = true,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      int level = -1, bool prefetch_index_and_filter_in_cache = true,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      Temperature file_temperature = Temperature::kUnknown);
+
+  // Get TableReader from a cache handle.
+  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+  // Get the table properties of a given table.
+  // @no_io: if true, do not load the table into the cache if it is not
+  //    already present.
+  // @returns: `properties` will be reset on success. Please note that we will
+  //    return Status::Incomplete() if the table is not present in the cache
+  //    and `no_io` is set to true.
+  Status GetTableProperties(
+      const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      std::shared_ptr<const TableProperties>* properties,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool no_io = false);
+
+  Status ApproximateKeyAnchors(const ReadOptions& ro,
+                               const InternalKeyComparator& internal_comparator,
+                               const FileMetaData& file_meta,
+                               std::vector<TableReader::Anchor>& anchors);
+
+  // Return the total memory usage of the table reader of the file,
+  // or 0 if the table reader of the file is not loaded.
+  size_t GetMemoryUsageByTableReader(
+      const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Returns the approximated offset of a key in a file represented by fd.
+  uint64_t ApproximateOffsetOf(
+      const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+      const InternalKeyComparator& internal_comparator,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Returns the approximated data size between the start and end keys in a
+  // file represented by fd (the start key must not be greater than the end
+  // key).
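+  //
+  // Illustrative call (a sketch only; the variable names here are assumed):
+  //   uint64_t bytes = table_cache->ApproximateSize(
+  //       start_internal_key, end_internal_key, *file_meta,
+  //       TableReaderCaller::kUserApproximateSize, icmp);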
+  uint64_t ApproximateSize(
+      const Slice& start, const Slice& end, const FileMetaData& file_meta,
+      TableReaderCaller caller,
+      const InternalKeyComparator& internal_comparator,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Release the handle from a cache
+  void ReleaseHandle(Cache::Handle* handle);
+
+  Cache* get_cache() const { return cache_; }
+
+  // Capacity of the backing Cache that indicates infinite TableCache capacity.
+  // For example when max_open_files is -1 we set the backing Cache to this.
+  static const int kInfiniteCapacity = 0x400000;
+
+  // The tables opened with this TableCache will be immortal, i.e., their
+  // lifetime is as long as that of the DB.
+  void SetTablesAreImmortal() {
+    if (cache_->GetCapacity() >= kInfiniteCapacity) {
+      immortal_tables_ = true;
+    }
+  }
+
+ private:
+  // Build a table reader
+  Status GetTableReader(
+      const ReadOptions& ro, const FileOptions& file_options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, bool sequential_mode,
+      bool record_read_stats, HistogramImpl* file_read_hist,
+      std::unique_ptr<TableReader>* table_reader,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool skip_filters = false, int level = -1,
+      bool prefetch_index_and_filter_in_cache = true,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      Temperature file_temperature = Temperature::kUnknown);
+
+  // Update the max_covering_tombstone_seq in the GetContext for each key based
+  // on the range deletions in the table
+  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+                                   MultiGetContext::Range& table_range);
+
+  // Create a key prefix for looking up the row cache. The prefix is of the
+  // format row_cache_id + fd_number + seq_no. Later, the user key can be
+  // appended to form the full key
+  void CreateRowCacheKeyPrefix(const ReadOptions& options,
+                               const FileDescriptor& fd,
+                               const Slice& internal_key,
+                               GetContext* get_context, IterKey& row_cache_key);
+
+  // Helper function to look up the row cache for a key. It appends the
+  // user key to row_cache_key at offset prefix_size
+  bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                       size_t prefix_size, GetContext* get_context);
+
+  const ImmutableOptions& ioptions_;
+  const FileOptions& file_options_;
+  Cache* const cache_;
+  std::string row_cache_id_;
+  bool immortal_tables_;
+  BlockCacheTracer* const block_cache_tracer_;
+  Striped<port::Mutex, Slice> loader_mutex_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::string db_session_id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache_sync_and_async.h b/src/rocksdb/db/table_cache_sync_and_async.h
new file mode 100644
index 000000000..e72abdd45
--- /dev/null
+++ b/src/rocksdb/db/table_cache_sync_and_async.h
@@ -0,0 +1,135 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(WITHOUT_COROUTINES)
+#endif
+
+// Batched version of TableCache::MultiGet.
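+// Depending on which of WITH_COROUTINES / WITHOUT_COROUTINES is defined when
+// table_cache.cc includes this header, the DEFINE_SYNC_AND_ASYNC macro below
+// expands to either the regular synchronous TableCache::MultiGet or a
+// coroutine-based variant (see util/coro_utils.h for the exact expansion).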
+DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) +(const ReadOptions& options, const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, + int level, Cache::Handle* table_handle) { + auto& fd = file_meta.fd; + Status s; + TableReader* t = fd.table_reader; + Cache::Handle* handle = table_handle; + MultiGetRange table_range(*mget_range, mget_range->begin(), + mget_range->end()); + if (handle != nullptr && t == nullptr) { + t = GetTableReaderFromHandle(handle); + } +#ifndef ROCKSDB_LITE + autovector row_cache_entries; + IterKey row_cache_key; + size_t row_cache_key_prefix_size = 0; + KeyContext& first_key = *table_range.begin(); + bool lookup_row_cache = + ioptions_.row_cache && !first_key.get_context->NeedToReadSequence(); + + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. + if (lookup_row_cache) { + GetContext* first_context = first_key.get_context; + CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context, + row_cache_key); + row_cache_key_prefix_size = row_cache_key.Size(); + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + const Slice& user_key = miter->ukey_with_ts; + + GetContext* get_context = miter->get_context; + + if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, + get_context)) { + table_range.SkipKey(miter); + } else { + row_cache_entries.emplace_back(); + get_context->SetReplayLog(&(row_cache_entries.back())); + } + } + } +#endif // ROCKSDB_LITE + + // Check that table_range is not empty. Its possible all keys may have been + // found in the row cache and thus the range may now be empty + if (s.ok() && !table_range.empty()) { + if (t == nullptr) { + assert(handle == nullptr); + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); + TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + assert(t); + } + } + if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) { + UpdateRangeTombstoneSeqnums(options, t, table_range); + } + if (s.ok()) { + CO_AWAIT(t->MultiGet) + (options, &table_range, prefix_extractor.get(), skip_filters); + } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { + Status* status = iter->s; + if (status->IsIncomplete()) { + // Couldn't find Table in cache but treat as kFound if no_io set + iter->get_context->MarkKeyMayExist(); + s = Status::OK(); + } + } + } + } + +#ifndef ROCKSDB_LITE + if (lookup_row_cache) { + size_t row_idx = 0; + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + std::string& row_cache_entry = row_cache_entries[row_idx++]; + const Slice& user_key = miter->ukey_with_ts; + ; + GetContext* get_context = miter->get_context; + + get_context->SetReplayLog(nullptr); + // Compute row cache key. 
+ row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(), + user_key.size()); + // Put the replay log in row cache only if something was found. + if (s.ok() && !row_cache_entry.empty()) { + size_t charge = row_cache_entry.capacity() + sizeof(std::string); + void* row_ptr = new std::string(std::move(row_cache_entry)); + // If row cache is full, it's OK. + ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); + } + } + } +#endif // ROCKSDB_LITE + + if (handle != nullptr) { + ReleaseHandle(handle); + } + CO_RETURN s; +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc new file mode 100644 index 000000000..edb9a1b63 --- /dev/null +++ b/src/rocksdb/db/table_properties_collector.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/table_properties_collector.h" + +#include "db/dbformat.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint64_t GetUint64Property(const UserCollectedProperties& props, + const std::string& property_name, + bool* property_present) { + auto pos = props.find(property_name); + if (pos == props.end()) { + *property_present = false; + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + *property_present = true; + return GetVarint64(&raw, &val) ? val : 0; +} + +} // anonymous namespace + +Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, + const Slice& value, + uint64_t file_size) { + ParsedInternalKey ikey; + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { + return s; + } + + return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type), + ikey.sequence, file_size); +} + +void UserKeyTablePropertiesCollector::BlockAdd( + uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); +} + +Status UserKeyTablePropertiesCollector::Finish( + UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() + const { + return collector_->GetReadableProperties(); +} + +uint64_t GetDeletedKeys(const UserCollectedProperties& props) { + bool property_present_ignored; + return GetUint64Property(props, TablePropertiesNames::kDeletedKeys, + &property_present_ignored); +} + +uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present) { + return GetUint64Property(props, TablePropertiesNames::kMergeOperands, + property_present); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h new file mode 100644 index 000000000..9035ba793 --- /dev/null +++ b/src/rocksdb/db/table_properties_collector.h @@ -0,0 +1,175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collectors.
+class IntTblPropCollector {
+ public:
+  virtual ~IntTblPropCollector() {}
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  virtual const char* Name() const = 0;
+
+  // @param key the user key that is inserted into the table.
+  // @param value the value that is inserted into the table.
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) = 0;
+
+  virtual void BlockAdd(uint64_t block_uncomp_bytes,
+                        uint64_t block_compressed_bytes_fast,
+                        uint64_t block_compressed_bytes_slow) = 0;
+
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collectors.
+class IntTblPropCollectorFactory {
+ public:
+  virtual ~IntTblPropCollectorFactory() {}
+  // has to be thread-safe
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id, int level_at_creation) = 0;
+
+  // The name of the properties collector can be used for debugging purposes.
+  virtual const char* Name() const = 0;
+};
+
+using IntTblPropCollectorFactories =
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>;
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contain the meta information of a given entry.
+//
+// This class extracts the user key from the encoded internal key when Add()
+// is invoked.
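+//
+// For example (illustrative): the internal key for Put("foo") at sequence
+// number 100 is the bytes of "foo" followed by an 8-byte trailer packing
+// (100 << 8) | kTypeValue. InternalAdd() below parses that trailer and hands
+// only the user key "foo", together with the entry type and sequence number,
+// to the wrapped user collector.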
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+  // transfer of ownership
+  explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+      : collector_(collector) {}
+
+  virtual ~UserKeyTablePropertiesCollector() {}
+
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override;
+
+  virtual void BlockAdd(uint64_t block_uncomp_bytes,
+                        uint64_t block_compressed_bytes_fast,
+                        uint64_t block_compressed_bytes_slow) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+  virtual bool NeedCompact() const override {
+    return collector_->NeedCompact();
+  }
+
+ protected:
+  std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit UserKeyTablePropertiesCollectorFactory(
+      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+      : user_collector_factory_(user_collector_factory) {}
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id, int level_at_creation) override {
+    TablePropertiesCollectorFactory::Context context;
+    context.column_family_id = column_family_id;
+    context.level_at_creation = level_at_creation;
+    return new UserKeyTablePropertiesCollector(
+        user_collector_factory_->CreateTablePropertiesCollector(context));
+  }
+
+  virtual const char* Name() const override {
+    return user_collector_factory_->Name();
+  }
+
+ private:
+  std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys". This class collects the min/max timestamp from the encoded
+// internal key when Add() is invoked.
+//
+// @param cmp the user comparator to compare the timestamps in internal key.
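+//
+// For example (illustrative): with a comparator whose timestamp_size() is 8,
+// each user key is laid out as <raw user key><8-byte timestamp>. The
+// collector slices those trailing bytes off via ExtractTimestampFromUserKey()
+// and keeps a running min and max across all keys added to the table.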
+class TimestampTablePropertiesCollector : public IntTblPropCollector { + public: + explicit TimestampTablePropertiesCollector(const Comparator* cmp) + : cmp_(cmp), + timestamp_min_(kDisableUserTimestamp), + timestamp_max_(kDisableUserTimestamp) {} + + Status InternalAdd(const Slice& key, const Slice& /* value */, + uint64_t /* file_size */) override { + auto user_key = ExtractUserKey(key); + assert(cmp_ && cmp_->timestamp_size() > 0); + if (user_key.size() < cmp_->timestamp_size()) { + return Status::Corruption( + "User key size mismatch when comparing to timestamp size."); + } + auto timestamp_in_key = + ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size()); + if (timestamp_max_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) { + timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + if (timestamp_min_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) { + timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + return; + } + + Status Finish(UserCollectedProperties* properties) override { + assert(timestamp_min_.size() == timestamp_max_.size() && + timestamp_max_.size() == cmp_->timestamp_size()); + properties->insert({"rocksdb.timestamp_min", timestamp_min_}); + properties->insert({"rocksdb.timestamp_max", timestamp_max_}); + return Status::OK(); + } + + const char* Name() const override { + return "TimestampTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)}, + {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}}; + } + + protected: + const Comparator* const cmp_; + std::string timestamp_min_; + std::string timestamp_max_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc new file mode 100644 index 000000000..5f0f205da --- /dev/null +++ b/src/rocksdb/db/table_properties_collector_test.cc @@ -0,0 +1,513 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#include "db/table_properties_collector.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+                            public testing::WithParamInterface<bool> {
+ public:
+  void SetUp() override { backward_mode_ = GetParam(); }
+
+  bool backward_mode_;
+};
+
+// Utility test functions
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+static const int kTestLevel = 1;
+
+void MakeBuilder(
+    const Options& options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& moptions,
+    const InternalKeyComparator& internal_comparator,
+    const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+    std::unique_ptr<WritableFileWriter>* writable,
+    std::unique_ptr<TableBuilder>* builder) {
+  std::unique_ptr<FSWritableFile> wf(new test::StringSink);
+  writable->reset(
+      new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions()));
+  TableBuilderOptions tboptions(
+      ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+      options.compression, options.compression_opts, kTestColumnFamilyId,
+      kTestColumnFamilyName, kTestLevel);
+  builder->reset(NewTableBuilder(tboptions, writable->get()));
+}
+}  // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    std::string encoded_num_puts;
+    std::string encoded_num_deletes;
+    std::string encoded_num_single_deletes;
+    std::string encoded_num_size_changes;
+    PutVarint32(&encoded, count_);
+    PutVarint32(&encoded_num_puts, num_puts_);
+    PutVarint32(&encoded_num_deletes, num_deletes_);
+    PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+    PutVarint32(&encoded_num_size_changes, num_size_changes_);
+    *properties = UserCollectedProperties{
+        {"TablePropertiesTest", message_},
+        {"Count", encoded},
+        {"NumPuts", encoded_num_puts},
+        {"NumDeletes", encoded_num_deletes},
+        {"NumSingleDeletes", encoded_num_single_deletes},
+        {"NumSizeChanges", encoded_num_size_changes},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+                    EntryType type, SequenceNumber /*seq*/,
+                    uint64_t file_size) override {
+    // simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    if (type == kEntryPut) {
+      num_puts_++;
+    } else if (type == kEntryDelete) {
+      num_deletes_++;
+    } else if (type == kEntrySingleDelete) {
+      num_single_deletes_++;
+    }
+    if (file_size < file_size_) {
+      message_ = "File size should not decrease.";
+    } else if (file_size != file_size_) {
+      num_size_changes_++;
+    }
+
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+  uint32_t num_puts_ = 0;
+  uint32_t num_deletes_ = 0;
+  uint32_t num_single_deletes_ = 0;
+  uint32_t num_size_changes_ = 0;
+  uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table. Backward compatible mode.
+// It is also used to test the internal key table properties collector.
+class RegularKeysStartWithABackwardCompatible
+    : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status Add(const Slice& user_key, const Slice& /*value*/) override {
+    // simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t /* block_compressed_bytes_fast */,
+                uint64_t /* block_compressed_bytes_slow */) override {
+    // Nothing to do.
+ return; + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + private: + uint32_t count_ = 0; +}; + +class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, + public TablePropertiesCollectorFactory { + public: + explicit RegularKeysStartWithAFactory(bool backward_mode) + : backward_mode_(backward_mode) {} + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + EXPECT_EQ(kTestColumnFamilyId, context.column_family_id); + EXPECT_EQ(kTestLevel, context.level_at_creation); + if (!backward_mode_) { + return new RegularKeysStartWithA(); + } else { + return new RegularKeysStartWithABackwardCompatible(); + } + } + IntTblPropCollector* CreateIntTblPropCollector( + uint32_t /*column_family_id*/, int /* level_at_creation */) override { + return new RegularKeysStartWithAInternal(); + } + const char* Name() const override { return "RegularKeysStartWithA"; } + + bool backward_mode_; +}; + +class FlushBlockEveryThreePolicy : public FlushBlockPolicy { + public: + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + return (++count_ % 3U == 0); + } + + private: + uint64_t count_ = 0; +}; + +class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory { + public: + explicit FlushBlockEveryThreePolicyFactory() {} + + const char* Name() const override { + return "FlushBlockEveryThreePolicyFactory"; + } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return new FlushBlockEveryThreePolicy; + } +}; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +namespace { +void TestCustomizedTablePropertiesCollector( + bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector, + const Options& options, const InternalKeyComparator& internal_comparator) { + // make sure the entries will be inserted with order. 
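+  // (std::map iterates in key order, so however the initializer below is
+  // written, the builder receives the keys already sorted, which is what
+  // TableBuilder::Add() requires.)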
+ std::map, std::string> kvs = { + {{"About ", kTypeValue}, "val5"}, // starts with 'A' + {{"Abstract", kTypeValue}, "val2"}, // starts with 'A' + {{"Around ", kTypeValue}, "val7"}, // starts with 'A' + {{"Beyond ", kTypeValue}, "val3"}, + {{"Builder ", kTypeValue}, "val1"}, + {{"Love ", kTypeDeletion}, ""}, + {{"Cancel ", kTypeValue}, "val4"}, + {{"Find ", kTypeValue}, "val6"}, + {{"Rocks ", kTypeDeletion}, ""}, + {{"Foo ", kTypeSingleDeletion}, ""}, + }; + + // -- Step 1: build table + std::unique_ptr builder; + std::unique_ptr writer; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + if (test_int_tbl_prop_collector) { + int_tbl_prop_collector_factories.emplace_back( + new RegularKeysStartWithAFactory(backward_mode)); + } else { + GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); + } + MakeBuilder(options, ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, &writer, &builder); + + SequenceNumber seqNum = 0U; + for (const auto& kv : kvs) { + InternalKey ikey(kv.first.first, seqNum++, kv.first.second); + builder->Add(ikey.Encode(), kv.second); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(writer->Flush()); + + // -- Step 2: Read properties + test::StringSink* fwf = + static_cast(writer->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); + std::unique_ptr fake_file_reader( + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; + Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), + magic_number, ioptions, &props); + ASSERT_OK(s); + + auto user_collected = props->user_collected_properties; + + ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end()); + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + ASSERT_NE(user_collected.find("Count"), user_collected.end()); + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); + + if (!backward_mode && !test_int_tbl_prop_collector) { + uint32_t num_puts; + ASSERT_NE(user_collected.find("NumPuts"), user_collected.end()); + Slice key_puts(user_collected.at("NumPuts")); + ASSERT_TRUE(GetVarint32(&key_puts, &num_puts)); + ASSERT_EQ(7u, num_puts); + + uint32_t num_deletes; + ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end()); + Slice key_deletes(user_collected.at("NumDeletes")); + ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes)); + ASSERT_EQ(2u, num_deletes); + + uint32_t num_single_deletes; + ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end()); + Slice key_single_deletes(user_collected.at("NumSingleDeletes")); + ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes)); + ASSERT_EQ(1u, num_single_deletes); + + uint32_t num_size_changes; + ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end()); + Slice key_size_changes(user_collected.at("NumSizeChanges")); + ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes)); + ASSERT_GE(num_size_changes, 2u); + } +} +} // namespace + +TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) { + // Test properties collectors with internal keys or regular keys + // for block based table + for (bool encode_as_internal : {true, false}) { + Options options; + BlockBasedTableOptions table_options; + table_options.flush_block_policy_factory = + std::make_shared(); 
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::PlainInternalKeyComparator ikc(options.comparator); + std::shared_ptr collector_factory( + new RegularKeysStartWithAFactory(backward_mode_)); + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + + TestCustomizedTablePropertiesCollector(backward_mode_, + kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); + +#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite + // test plain table + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 8; + plain_table_options.bloom_bits_per_key = 8; + plain_table_options.hash_table_ratio = 0; + + options.table_factory = + std::make_shared(plain_table_options); + TestCustomizedTablePropertiesCollector(backward_mode_, + kPlainTableMagicNumber, + encode_as_internal, options, ikc); +#endif // !ROCKSDB_LITE + } +} + +namespace { +void TestInternalKeyPropertiesCollector( + bool backward_mode, uint64_t magic_number, bool sanitized, + std::shared_ptr table_factory) { + InternalKey keys[] = { + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 1, ValueType::kTypeValue), + InternalKey("C ", 2, ValueType::kTypeValue), + InternalKey("W ", 3, ValueType::kTypeDeletion), + InternalKey("X ", 4, ValueType::kTypeDeletion), + InternalKey("Y ", 5, ValueType::kTypeDeletion), + InternalKey("Z ", 6, ValueType::kTypeDeletion), + InternalKey("a ", 7, ValueType::kTypeSingleDeletion), + InternalKey("b ", 8, ValueType::kTypeMerge), + InternalKey("c ", 9, ValueType::kTypeMerge), + }; + + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collector_factories.emplace_back( + new RegularKeysStartWithAFactory(backward_mode)); + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). 
+ options.info_log = std::make_shared(); + options = SanitizeOptions("db", // just a place holder + options); + ImmutableOptions ioptions(options); + GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); + options.comparator = comparator; + } + const ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + + for (int iter = 0; iter < 2; ++iter) { + MakeBuilder(options, ioptions, moptions, pikc, + &int_tbl_prop_collector_factories, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } + + ASSERT_OK(builder->Finish()); + ASSERT_OK(writable->Flush()); + + test::StringSink* fwf = + static_cast(writable->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); + std::unique_ptr reader( + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; + Status s = ReadTableProperties(reader.get(), fwf->contents().size(), + magic_number, ioptions, &props); + ASSERT_OK(s); + + auto user_collected = props->user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(5u, deleted); // deletes + single-deletes + + bool property_present; + uint64_t merges = GetMergeOperands(user_collected, &property_present); + ASSERT_TRUE(property_present); + ASSERT_EQ(2u, merges); + + if (sanitized) { + uint32_t starts_with_A = 0; + ASSERT_NE(user_collected.find("Count"), user_collected.end()); + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); + + if (!backward_mode) { + uint32_t num_puts; + ASSERT_NE(user_collected.find("NumPuts"), user_collected.end()); + Slice key_puts(user_collected.at("NumPuts")); + ASSERT_TRUE(GetVarint32(&key_puts, &num_puts)); + ASSERT_EQ(3u, num_puts); + + uint32_t num_deletes; + ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end()); + Slice key_deletes(user_collected.at("NumDeletes")); + ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes)); + ASSERT_EQ(4u, num_deletes); + + uint32_t num_single_deletes; + ASSERT_NE(user_collected.find("NumSingleDeletes"), + user_collected.end()); + Slice key_single_deletes(user_collected.at("NumSingleDeletes")); + ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes)); + ASSERT_EQ(1u, num_single_deletes); + } + } + } +} +} // namespace + +TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */, + std::make_shared()); + if (backward_mode_) { + TestInternalKeyPropertiesCollector( + backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */, + std::make_shared()); + } + +#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 8; + plain_table_options.bloom_bits_per_key = 8; + plain_table_options.hash_table_ratio = 0; + + TestInternalKeyPropertiesCollector( + backward_mode_, kPlainTableMagicNumber, false /* not sanitize */, + std::make_shared(plain_table_options)); +#endif // !ROCKSDB_LITE +} + +INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest, + ::testing::Bool()); + +INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest, + ::testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git 
a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc new file mode 100644 index 000000000..3878b428a --- /dev/null +++ b/src/rocksdb/db/transaction_log_impl.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "db/transaction_log_impl.h" + +#include + +#include "db/write_batch_internal.h" +#include "file/sequence_file_reader.h" +#include "util/defer.h" + +namespace ROCKSDB_NAMESPACE { + +TransactionLogIteratorImpl::TransactionLogIteratorImpl( + const std::string& dir, const ImmutableDBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seq, + std::unique_ptr files, VersionSet const* const versions, + const bool seq_per_batch, const std::shared_ptr& io_tracer) + : dir_(dir), + options_(options), + read_options_(read_options), + soptions_(soptions), + starting_sequence_number_(seq), + files_(std::move(files)), + versions_(versions), + seq_per_batch_(seq_per_batch), + io_tracer_(io_tracer), + started_(false), + is_valid_(false), + current_file_index_(0), + current_batch_seq_(0), + current_last_seq_(0) { + assert(files_ != nullptr); + assert(versions_ != nullptr); + assert(!seq_per_batch_); + current_status_.PermitUncheckedError(); // Clear on start + reporter_.env = options_->env; + reporter_.info_log = options_->info_log.get(); + SeekToStartSequence(); // Seek till starting sequence +} + +Status TransactionLogIteratorImpl::OpenLogFile( + const LogFile* log_file, + std::unique_ptr* file_reader) { + FileSystemPtr fs(options_->fs, io_tracer_); + std::unique_ptr file; + std::string fname; + Status s; + EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_); + if (log_file->Type() == kArchivedLogFile) { + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + } else { + fname = LogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + if (!s.ok()) { + // If cannot open file in DB directory. + // Try the archive dir, as it could have moved in the meanwhile. + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + } + } + if (s.ok()) { + file_reader->reset(new SequentialFileReader(std::move(file), fname, + io_tracer_, options_->listeners, + options_->rate_limiter.get())); + } + return s; +} + +BatchResult TransactionLogIteratorImpl::GetBatch() { + assert(is_valid_); // cannot call in a non valid state. 
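+  // Note: current_batch_ is moved into the result below, so each position
+  // yields its batch exactly once; callers should keep the returned
+  // BatchResult rather than call GetBatch() again before the next Next().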
+ BatchResult result; + result.sequence = current_batch_seq_; + result.writeBatchPtr = std::move(current_batch_); + return result; +} + +Status TransactionLogIteratorImpl::status() { return current_status_; } + +bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; } + +bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) { + // Don't read if no more complete entries to read from logs + if (current_last_seq_ >= versions_->LastSequence()) { + return false; + } + return current_log_reader_->ReadRecord(record, &scratch_); +} + +void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, + bool strict) { + Slice record; + started_ = false; + is_valid_ = false; + // Check invariant of TransactionLogIterator when SeekToStartSequence() + // succeeds. + const Defer defer([this]() { + if (is_valid_) { + assert(current_status_.ok()); + if (starting_sequence_number_ > current_batch_seq_) { + assert(current_batch_seq_ < current_last_seq_); + assert(current_last_seq_ >= starting_sequence_number_); + } + } + }); + if (files_->size() <= start_file_index) { + return; + } else if (!current_status_.ok()) { + return; + } + Status s = + OpenLogReader(files_->at(static_cast(start_file_index)).get()); + if (!s.ok()) { + current_status_ = s; + reporter_.Info(current_status_.ToString().c_str()); + return; + } + while (RestrictedRead(&record)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); + continue; + } + UpdateCurrentWriteBatch(record); + if (current_last_seq_ >= starting_sequence_number_) { + if (strict && current_batch_seq_ != starting_sequence_number_) { + current_status_ = Status::Corruption( + "Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); + return; + } else if (strict) { + reporter_.Info( + "Could seek required sequence number. Iterator will " + "continue."); + } + is_valid_ = true; + started_ = true; // set started_ as we could seek till starting sequence + return; + } else { + is_valid_ = false; + } + } + + // Could not find start sequence in first file. Normally this must be the + // only file. Otherwise log the error and let the iterator return next entry + // If strict is set, we want to seek exactly till the start sequence and it + // should have been present in the file we scanned above + if (strict) { + current_status_ = Status::Corruption( + "Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); + } else if (files_->size() != 1) { + current_status_ = Status::Corruption( + "Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(current_status_.ToString().c_str()); + // Let NextImpl find the next available entry. 
started_ remains false + // because we don't want to check for gaps while moving to start sequence + NextImpl(true); + } +} + +void TransactionLogIteratorImpl::Next() { + if (!current_status_.ok()) { + return; + } + return NextImpl(false); +} + +void TransactionLogIteratorImpl::NextImpl(bool internal) { + Slice record; + is_valid_ = false; + if (!internal && !started_) { + // Runs every time until we can seek to the start sequence + SeekToStartSequence(); + } + while (true) { + assert(current_log_reader_); + if (current_log_reader_->IsEOF()) { + current_log_reader_->UnmarkEOF(); + } + while (RestrictedRead(&record)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); + continue; + } else { + // started_ should be true if called by application + assert(internal || started_); + // started_ should be false if called internally + assert(!internal || !started_); + UpdateCurrentWriteBatch(record); + if (internal && !started_) { + started_ = true; + } + return; + } + } + + // Open the next file + if (current_file_index_ < files_->size() - 1) { + ++current_file_index_; + Status s = OpenLogReader(files_->at(current_file_index_).get()); + if (!s.ok()) { + is_valid_ = false; + current_status_ = s; + return; + } + } else { + is_valid_ = false; + if (current_last_seq_ == versions_->LastSequence()) { + current_status_ = Status::OK(); + } else { + const char* msg = "Create a new iterator to fetch the new tail."; + current_status_ = Status::TryAgain(msg); + } + return; + } + } +} + +bool TransactionLogIteratorImpl::IsBatchExpected( + const WriteBatch* batch, const SequenceNumber expected_seq) { + assert(batch); + SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); + if (batchSeq != expected_seq) { + char buf[200]; + snprintf(buf, sizeof(buf), + "Discontinuity in log records. Got seq=%" PRIu64 + ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64 + ".Log iterator will reseek the correct batch.", + batchSeq, expected_seq, versions_->LastSequence()); + reporter_.Info(buf); + return false; + } + return true; +} + +void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { + std::unique_ptr batch(new WriteBatch()); + Status s = WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? + + SequenceNumber expected_seq = current_last_seq_ + 1; + // If the iterator has started, then confirm that we get continuous batches + if (started_ && !IsBatchExpected(batch.get(), expected_seq)) { + // Seek to the batch having expected sequence number + if (expected_seq < files_->at(current_file_index_)->StartSequence()) { + // Expected batch must lie in the previous log file + // Avoid underflow. + if (current_file_index_ != 0) { + current_file_index_--; + } + } + starting_sequence_number_ = expected_seq; + // currentStatus_ will be set to Ok if reseek succeeds + // Note: this is still ok in seq_pre_batch_ && two_write_queuesp_ mode + // that allows gaps in the WAL since it will still skip over the gap. 
+ current_status_ = Status::NotFound("Gap in sequence numbers"); + // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode + // should be disabled + return SeekToStartSequence(current_file_index_, !seq_per_batch_); + } + + current_batch_seq_ = WriteBatchInternal::Sequence(batch.get()); + assert(!seq_per_batch_); + current_last_seq_ = + current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1; + // currentBatchSeq_ can only change here + assert(current_last_seq_ <= versions_->LastSequence()); + + current_batch_ = std::move(batch); + is_valid_ = true; + current_status_ = Status::OK(); +} + +Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) { + std::unique_ptr file; + Status s = OpenLogFile(log_file, &file); + if (!s.ok()) { + return s; + } + assert(file); + current_log_reader_.reset( + new log::Reader(options_->info_log, std::move(file), &reporter_, + read_options_.verify_checksums_, log_file->LogNumber())); + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h new file mode 100644 index 000000000..e8c6efc02 --- /dev/null +++ b/src/rocksdb/db/transaction_log_impl.h @@ -0,0 +1,130 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#ifndef ROCKSDB_LITE +#include + +#include "db/log_reader.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFileImpl : public LogFile { + public: + LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, + uint64_t sizeBytes) + : logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) {} + + std::string PathName() const override { + if (type_ == kArchivedLogFile) { + return ArchivedLogFileName("", logNumber_); + } + return LogFileName("", logNumber_); + } + + uint64_t LogNumber() const override { return logNumber_; } + + WalFileType Type() const override { return type_; } + + SequenceNumber StartSequence() const override { return startSequence_; } + + uint64_t SizeFileBytes() const override { return sizeFileBytes_; } + + bool operator<(const LogFile& that) const { + return LogNumber() < that.LogNumber(); + } + + private: + uint64_t logNumber_; + WalFileType type_; + SequenceNumber startSequence_; + uint64_t sizeFileBytes_; +}; + +class TransactionLogIteratorImpl : public TransactionLogIterator { + public: + TransactionLogIteratorImpl( + const std::string& dir, const ImmutableDBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seqNum, + std::unique_ptr files, VersionSet const* const versions, + const bool seq_per_batch, const std::shared_ptr& io_tracer); + + virtual bool Valid() override; + + virtual void Next() override; + + virtual Status status() override; + + virtual BatchResult GetBatch() override; + + private: + const std::string& dir_; + const ImmutableDBOptions* options_; + const TransactionLogIterator::ReadOptions read_options_; + const EnvOptions& soptions_; + 
SequenceNumber starting_sequence_number_; + std::unique_ptr files_; + // Used only to get latest seq. num + // TODO(icanadi) can this be just a callback? + VersionSet const* const versions_; + const bool seq_per_batch_; + std::shared_ptr io_tracer_; + + // State variables + bool started_; + bool is_valid_; // not valid when it starts of. + Status current_status_; + size_t current_file_index_; + std::unique_ptr current_batch_; + std::unique_ptr current_log_reader_; + std::string scratch_; + Status OpenLogFile(const LogFile* log_file, + std::unique_ptr* file); + + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes, + s.ToString().c_str()); + } + virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); } + } reporter_; + + SequenceNumber + current_batch_seq_; // sequence number at start of current batch + SequenceNumber current_last_seq_; // last sequence in the current batch + // Reads from transaction log only if the writebatch record has been written + bool RestrictedRead(Slice* record); + // Seeks to starting_sequence_number_ reading from start_file_index in files_. + // If strict is set, then must get a batch starting with + // starting_sequence_number_. + void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false); + // Implementation of Next. SeekToStartSequence calls it internally with + // internal=true to let it find next entry even if it has to jump gaps because + // the iterator may start off from the first available entry but promises to + // be continuous after that + void NextImpl(bool internal = false); + // Check if batch is expected, else return false + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq); + // Update current batch if a continuous batch is found. + void UpdateCurrentWriteBatch(const Slice& record); + Status OpenLogReader(const LogFile* file); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc new file mode 100644 index 000000000..d7ca0899f --- /dev/null +++ b/src/rocksdb/db/trim_history_scheduler.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
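+//
+// Rough usage sketch (illustrative; the real call sites live in DBImpl):
+// the write path queues a column family whose flushed immutable memtables
+// may be trimmable, and a background thread later drains the queue:
+//
+//   trim_history_scheduler_.ScheduleWork(cfd);
+//   ...
+//   ColumnFamilyData* cfd;
+//   while ((cfd = trim_history_scheduler_.TakeNextColumnFamily()) != nullptr) {
+//     // trim via MemtableList::TrimHistory(), then release the ref
+//     cfd->UnrefAndTryDelete();
+//   }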
+ +#include "db/trim_history_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace ROCKSDB_NAMESPACE { + +void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) { + std::lock_guard lock(checking_mutex_); + cfd->Ref(); + cfds_.push_back(cfd); + is_empty_.store(false, std::memory_order_relaxed); +} + +ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() { + std::lock_guard lock(checking_mutex_); + while (true) { + if (cfds_.empty()) { + return nullptr; + } + ColumnFamilyData* cfd = cfds_.back(); + cfds_.pop_back(); + if (cfds_.empty()) { + is_empty_.store(true, std::memory_order_relaxed); + } + + if (!cfd->IsDropped()) { + // success + return cfd; + } + cfd->UnrefAndTryDelete(); + } +} + +bool TrimHistoryScheduler::Empty() { + bool is_empty = is_empty_.load(std::memory_order_relaxed); + return is_empty; +} + +void TrimHistoryScheduler::Clear() { + ColumnFamilyData* cfd; + while ((cfd = TakeNextColumnFamily()) != nullptr) { + cfd->UnrefAndTryDelete(); + } + assert(Empty()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h new file mode 100644 index 000000000..252802a7a --- /dev/null +++ b/src/rocksdb/db/trim_history_scheduler.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyData; + +// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps +// track of column families whose flushed immutable memtables may need to be +// removed (aka trimmed). The actual trimming may be slightly delayed. Due to +// the use of the mutex and atomic variable, ScheduleWork, +// TakeNextColumnFamily, and, Empty can be called concurrently. +class TrimHistoryScheduler { + public: + TrimHistoryScheduler() : is_empty_(true) {} + + // When a column family needs history trimming, add cfd to the FIFO queue + void ScheduleWork(ColumnFamilyData* cfd); + + // Remove the column family from the queue, the caller is responsible for + // calling `MemtableList::TrimHistory` + ColumnFamilyData* TakeNextColumnFamily(); + + bool Empty(); + + void Clear(); + + // Not on critical path, use mutex to ensure thread safety + private: + std::atomic is_empty_; + autovector cfds_; + std::mutex checking_mutex_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc new file mode 100644 index 000000000..2c65dcf71 --- /dev/null +++ b/src/rocksdb/db/version_builder.cc @@ -0,0 +1,1372 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
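+//
+// (Overview, illustrative) A VersionBuilder accumulates a batch of
+// VersionEdits on top of a base version: for instance, an edit that deletes
+// file #7 on L1 and adds file #9 is recorded in per-level LevelState structs,
+// and SaveTo() later materializes the resulting version, with L0 ordered
+// newest-first by sequence number and L1+ ordered by smallest key.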
+ +#include "db/version_builder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache/cache_reservation_manager.h" +#include "db/blob/blob_file_meta.h" +#include "db/dbformat.h" +#include "db/internal_stats.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "port/port.h" +#include "table/table_reader.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionBuilder::Rep { + class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } + + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + } + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); + } + }; + + class BySmallestKey { + public: + explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {} + + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + assert(cmp_); + + const int r = cmp_->Compare(lhs->smallest, rhs->smallest); + if (r != 0) { + return (r < 0); + } + + // Break ties by file number + return (lhs->fd.GetNumber() < rhs->fd.GetNumber()); + } + + private: + const InternalKeyComparator* cmp_; + }; + + struct LevelState { + std::unordered_set deleted_files; + // Map from file number to file meta data. + std::unordered_map added_files; + }; + + // A class that represents the accumulated changes (like additional garbage or + // newly linked/unlinked SST files) for a given blob file after applying a + // series of VersionEdits. + class BlobFileMetaDataDelta { + public: + bool IsEmpty() const { + return !additional_garbage_count_ && !additional_garbage_bytes_ && + newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty(); + } + + uint64_t GetAdditionalGarbageCount() const { + return additional_garbage_count_; + } + + uint64_t GetAdditionalGarbageBytes() const { + return additional_garbage_bytes_; + } + + const std::unordered_set& GetNewlyLinkedSsts() const { + return newly_linked_ssts_; + } + + const std::unordered_set& GetNewlyUnlinkedSsts() const { + return newly_unlinked_ssts_; + } + + void AddGarbage(uint64_t count, uint64_t bytes) { + additional_garbage_count_ += count; + additional_garbage_bytes_ += bytes; + } + + void LinkSst(uint64_t sst_file_number) { + assert(newly_linked_ssts_.find(sst_file_number) == + newly_linked_ssts_.end()); + + // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) + auto it = newly_unlinked_ssts_.find(sst_file_number); + + if (it != newly_unlinked_ssts_.end()) { + newly_unlinked_ssts_.erase(it); + } else { + newly_linked_ssts_.emplace(sst_file_number); + } + } + + void UnlinkSst(uint64_t sst_file_number) { + assert(newly_unlinked_ssts_.find(sst_file_number) == + newly_unlinked_ssts_.end()); + + // Reconcile with newly linked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) 
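+    // For example, a trivial move that re-links SST #42 to the same blob
+    // file results in UnlinkSst(42) followed by LinkSst(42) (or the other
+    // way around); the two cancel out here, leaving no net delta.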
+ auto it = newly_linked_ssts_.find(sst_file_number); + + if (it != newly_linked_ssts_.end()) { + newly_linked_ssts_.erase(it); + } else { + newly_unlinked_ssts_.emplace(sst_file_number); + } + } + + private: + uint64_t additional_garbage_count_ = 0; + uint64_t additional_garbage_bytes_ = 0; + std::unordered_set newly_linked_ssts_; + std::unordered_set newly_unlinked_ssts_; + }; + + // A class that represents the state of a blob file after applying a series of + // VersionEdits. In addition to the resulting state, it also contains the + // delta (see BlobFileMetaDataDelta above). The resulting state can be used to + // identify obsolete blob files, while the delta makes it possible to + // efficiently detect trivial moves. + class MutableBlobFileMetaData { + public: + // To be used for brand new blob files + explicit MutableBlobFileMetaData( + std::shared_ptr&& shared_meta) + : shared_meta_(std::move(shared_meta)) {} + + // To be used for pre-existing blob files + explicit MutableBlobFileMetaData( + const std::shared_ptr& meta) + : shared_meta_(meta->GetSharedMeta()), + linked_ssts_(meta->GetLinkedSsts()), + garbage_blob_count_(meta->GetGarbageBlobCount()), + garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {} + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + + bool HasDelta() const { return !delta_.IsEmpty(); } + + const std::unordered_set& GetLinkedSsts() const { + return linked_ssts_; + } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + bool AddGarbage(uint64_t count, uint64_t bytes) { + assert(shared_meta_); + + if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() || + garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) { + return false; + } + + delta_.AddGarbage(count, bytes); + + garbage_blob_count_ += count; + garbage_blob_bytes_ += bytes; + + return true; + } + + void LinkSst(uint64_t sst_file_number) { + delta_.LinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end()); + linked_ssts_.emplace(sst_file_number); + } + + void UnlinkSst(uint64_t sst_file_number) { + delta_.UnlinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end()); + linked_ssts_.erase(sst_file_number); + } + + private: + std::shared_ptr shared_meta_; + // Accumulated changes + BlobFileMetaDataDelta delta_; + // Resulting state after applying the changes + BlobFileMetaData::LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; + }; + + const FileOptions& file_options_; + const ImmutableCFOptions* const ioptions_; + TableCache* table_cache_; + VersionStorageInfo* base_vstorage_; + VersionSet* version_set_; + int num_levels_; + LevelState* levels_; + // Store sizes of levels larger than num_levels_. We do this instead of + // storing them in levels_ to avoid regression in case there are no files + // on invalid levels. The version is not consistent if in the end the files + // on invalid levels don't cancel out. + std::unordered_map invalid_level_sizes_; + // Whether there are invalid new files or invalid deletion on levels larger + // than num_levels_. + bool has_invalid_levels_; + // Current levels of table files affected by additions/deletions. 
+ std::unordered_map table_file_levels_; + // Current compact cursors that should be changed after the last compaction + std::unordered_map updated_compact_cursors_; + NewestFirstBySeqNo level_zero_cmp_; + BySmallestKey level_nonzero_cmp_; + + // Mutable metadata objects for all blob files affected by the series of + // version edits. + std::map mutable_blob_file_metas_; + + std::shared_ptr file_metadata_cache_res_mgr_; + + public: + Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, + TableCache* table_cache, VersionStorageInfo* base_vstorage, + VersionSet* version_set, + std::shared_ptr file_metadata_cache_res_mgr) + : file_options_(file_options), + ioptions_(ioptions), + table_cache_(table_cache), + base_vstorage_(base_vstorage), + version_set_(version_set), + num_levels_(base_vstorage->num_levels()), + has_invalid_levels_(false), + level_nonzero_cmp_(base_vstorage_->InternalComparator()), + file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { + assert(ioptions_); + + levels_ = new LevelState[num_levels_]; + } + + ~Rep() { + for (int level = 0; level < num_levels_; level++) { + const auto& added = levels_[level].added_files; + for (auto& pair : added) { + UnrefFile(pair.second); + } + } + + delete[] levels_; + } + + void UnrefFile(FileMetaData* f) { + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + assert(table_cache_ != nullptr); + table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + f->ApproximateMemoryUsage(), false /* increase */); + s.PermitUncheckedError(); + } + delete f; + } + } + + // Mapping used for checking the consistency of links between SST files and + // blob files. It is built using the forward links (table file -> blob file), + // and is subsequently compared with the inverse mapping stored in the + // BlobFileMetaData objects. 
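+  // For example, if table files #10 and #11 both have their oldest blob in
+  // blob file #5, the mapping ends up with expected_linked_ssts[5] ==
+  // {10, 11}, which must match BlobFileMetaData::GetLinkedSsts() for blob
+  // file #5.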
+  // Mapping used for checking the consistency of links between SST files and
+  // blob files. It is built using the forward links (table file -> blob file),
+  // and is subsequently compared with the inverse mapping stored in the
+  // BlobFileMetaData objects.
+  using ExpectedLinkedSsts =
+      std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>;
+
+  static void UpdateExpectedLinkedSsts(
+      uint64_t table_file_number, uint64_t blob_file_number,
+      ExpectedLinkedSsts* expected_linked_ssts) {
+    assert(expected_linked_ssts);
+
+    if (blob_file_number == kInvalidBlobFileNumber) {
+      return;
+    }
+
+    (*expected_linked_ssts)[blob_file_number].emplace(table_file_number);
+  }
+
+  template <typename Checker>
+  Status CheckConsistencyDetailsForLevel(
+      const VersionStorageInfo* vstorage, int level, Checker checker,
+      const std::string& sync_point,
+      ExpectedLinkedSsts* expected_linked_ssts) const {
+#ifdef NDEBUG
+    (void)sync_point;
+#endif
+
+    assert(vstorage);
+    assert(level >= 0 && level < num_levels_);
+    assert(expected_linked_ssts);
+
+    const auto& level_files = vstorage->LevelFiles(level);
+
+    if (level_files.empty()) {
+      return Status::OK();
+    }
+
+    assert(level_files[0]);
+    UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(),
+                             level_files[0]->oldest_blob_file_number,
+                             expected_linked_ssts);
+
+    for (size_t i = 1; i < level_files.size(); ++i) {
+      assert(level_files[i]);
+      UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(),
+                               level_files[i]->oldest_blob_file_number,
+                               expected_linked_ssts);
+
+      auto lhs = level_files[i - 1];
+      auto rhs = level_files[i];
+
+#ifndef NDEBUG
+      auto pair = std::make_pair(&lhs, &rhs);
+      TEST_SYNC_POINT_CALLBACK(sync_point, &pair);
+#endif
+
+      const Status s = checker(lhs, rhs);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Make sure table files are sorted correctly and that the links between
+  // table files and blob files are consistent.
+  Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    ExpectedLinkedSsts expected_linked_ssts;
+
+    if (num_levels_ > 0) {
+      // Check L0
+      {
+        auto l0_checker = [this](const FileMetaData* lhs,
+                                 const FileMetaData* rhs) {
+          assert(lhs);
+          assert(rhs);
+
+          if (!level_zero_cmp_(lhs, rhs)) {
+            std::ostringstream oss;
+            oss << "L0 files are not sorted properly: files #"
+                << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
+            // This is an external file that we ingested
+            const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
+
+            if (!(external_file_seqno < lhs->fd.largest_seqno ||
+                  external_file_seqno == 0)) {
+              std::ostringstream oss;
+              oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+                  << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+                  << " vs. file #" << rhs->fd.GetNumber()
+                  << " with global_seqno " << external_file_seqno;
+
+              return Status::Corruption("VersionBuilder", oss.str());
+            }
+          } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
+            std::ostringstream oss;
+            oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+                << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+                << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
+                << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          return Status::OK();
+        };
+
+        const Status s = CheckConsistencyDetailsForLevel(
+            vstorage, /* level */ 0, l0_checker,
+            "VersionBuilder::CheckConsistency0", &expected_linked_ssts);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+
+      // Check L1 and up
+      const InternalKeyComparator* const icmp = vstorage->InternalComparator();
+      assert(icmp);
+
+      for (int level = 1; level < num_levels_; ++level) {
+        auto checker = [this, level, icmp](const FileMetaData* lhs,
+                                           const FileMetaData* rhs) {
+          assert(lhs);
+          assert(rhs);
+
+          if (!level_nonzero_cmp_(lhs, rhs)) {
+            std::ostringstream oss;
+            oss << 'L' << level << " files are not sorted properly: files #"
+                << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          // Make sure there is no overlap in level
+          if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) {
+            std::ostringstream oss;
+            oss << 'L' << level << " has overlapping ranges: file #"
+                << lhs->fd.GetNumber()
+                << " largest key: " << lhs->largest.DebugString(true)
+                << " vs. file #" << rhs->fd.GetNumber()
+                << " smallest key: " << rhs->smallest.DebugString(true);
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          return Status::OK();
+        };
+
+        const Status s = CheckConsistencyDetailsForLevel(
+            vstorage, level, checker, "VersionBuilder::CheckConsistency1",
+            &expected_linked_ssts);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+
+    // Make sure that all blob files in the version have non-garbage data and
+    // the links between them and the table files are consistent.
+    const auto& blob_files = vstorage->GetBlobFiles();
+    for (const auto& blob_file_meta : blob_files) {
+      assert(blob_file_meta);
+
+      const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
+
+      if (blob_file_meta->GetGarbageBlobCount() >=
+          blob_file_meta->GetTotalBlobCount()) {
+        std::ostringstream oss;
+        oss << "Blob file #" << blob_file_number
+            << " consists entirely of garbage";
+
+        return Status::Corruption("VersionBuilder", oss.str());
+      }
+
+      if (blob_file_meta->GetLinkedSsts() !=
+          expected_linked_ssts[blob_file_number]) {
+        std::ostringstream oss;
+        oss << "Links are inconsistent between table files and blob file #"
+            << blob_file_number;
+
+        return Status::Corruption("VersionBuilder", oss.str());
+      }
+    }
+
+    Status ret_s;
+    TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn",
+                             &ret_s);
+    return ret_s;
+  }
+
+  Status CheckConsistency(const VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    // Always run consistency checks in debug build
+#ifdef NDEBUG
+    if (!vstorage->force_consistency_checks()) {
+      return Status::OK();
+    }
+#endif
+    Status s = CheckConsistencyDetails(vstorage);
+    if (s.IsCorruption() && s.getState()) {
+      // Make it clear the error is due to force_consistency_checks = 1 or
+      // debug build
+#ifdef NDEBUG
+      auto prefix = "force_consistency_checks";
+#else
+      auto prefix = "force_consistency_checks(DEBUG)";
+#endif
+      s = Status::Corruption(prefix, s.getState());
+    } else {
+      // was only expecting corruption with message, or OK
+      assert(s.ok());
+    }
+    return s;
+  }
+
+  bool CheckConsistencyForNumLevels() const {
+    // Make sure there are no files on or beyond num_levels().
+ if (has_invalid_levels_) { + return false; + } + + for (const auto& pair : invalid_level_sizes_) { + const size_t level_size = pair.second; + if (level_size != 0) { + return false; + } + } + + return true; + } + + bool IsBlobFileInVersion(uint64_t blob_file_number) const { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return true; + } + + assert(base_vstorage_); + const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); + + return !!meta; + } + + MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData( + uint64_t blob_file_number) { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return &mutable_it->second; + } + + assert(base_vstorage_); + const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); + + if (meta) { + mutable_it = mutable_blob_file_metas_ + .emplace(blob_file_number, MutableBlobFileMetaData(meta)) + .first; + return &mutable_it->second; + } + + return nullptr; + } + + Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) { + const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber(); + + if (IsBlobFileInVersion(blob_file_number)) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " already added"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + // Note: we use C++11 for now but in C++14, this could be done in a more + // elegant way using generalized lambda capture. + VersionSet* const vs = version_set_; + const ImmutableCFOptions* const ioptions = ioptions_; + + auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) { + if (vs) { + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + assert(shared_meta); + + vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(), + ioptions->cf_paths.front().path); + } + + delete shared_meta; + }; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, blob_file_addition.GetTotalBlobCount(), + blob_file_addition.GetTotalBlobBytes(), + blob_file_addition.GetChecksumMethod(), + blob_file_addition.GetChecksumValue(), deleter); + + mutable_blob_file_metas_.emplace( + blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); + + return Status::OK(); + } + + Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { + const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber(); + + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + + if (!mutable_meta) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " not found"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(), + blob_file_garbage.GetGarbageBlobBytes())) { + std::ostringstream oss; + oss << "Garbage overflow for blob file #" << blob_file_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + + return Status::OK(); + } + + int GetCurrentLevelForTableFile(uint64_t file_number) const { + auto it = table_file_levels_.find(file_number); + if (it != table_file_levels_.end()) { + return it->second; + } + + assert(base_vstorage_); + return base_vstorage_->GetFileLocation(file_number).GetLevel(); + } + + uint64_t GetOldestBlobFileNumberForTableFile(int level, + uint64_t file_number) const { + assert(level < num_levels_); + + const auto& added_files = levels_[level].added_files; + + auto 
it = added_files.find(file_number);
+    if (it != added_files.end()) {
+      const FileMetaData* const meta = it->second;
+      assert(meta);
+
+      return meta->oldest_blob_file_number;
+    }
+
+    assert(base_vstorage_);
+    const FileMetaData* const meta =
+        base_vstorage_->GetFileMetaDataByNumber(file_number);
+    assert(meta);
+
+    return meta->oldest_blob_file_number;
+  }
+
+  Status ApplyFileDeletion(int level, uint64_t file_number) {
+    assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+    const int current_level = GetCurrentLevelForTableFile(file_number);
+
+    if (level != current_level) {
+      if (level >= num_levels_) {
+        has_invalid_levels_ = true;
+      }
+
+      std::ostringstream oss;
+      oss << "Cannot delete table file #" << file_number << " from level "
+          << level << " since it is ";
+      if (current_level ==
+          VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+        oss << "not in the LSM tree";
+      } else {
+        oss << "on level " << current_level;
+      }
+
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    if (level >= num_levels_) {
+      assert(invalid_level_sizes_[level] > 0);
+      --invalid_level_sizes_[level];
+
+      table_file_levels_[file_number] =
+          VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+      return Status::OK();
+    }
+
+    const uint64_t blob_file_number =
+        GetOldestBlobFileNumberForTableFile(level, file_number);
+
+    if (blob_file_number != kInvalidBlobFileNumber) {
+      MutableBlobFileMetaData* const mutable_meta =
+          GetOrCreateMutableBlobFileMetaData(blob_file_number);
+      if (mutable_meta) {
+        mutable_meta->UnlinkSst(file_number);
+      }
+    }
+
+    auto& level_state = levels_[level];
+
+    auto& add_files = level_state.added_files;
+    auto add_it = add_files.find(file_number);
+    if (add_it != add_files.end()) {
+      UnrefFile(add_it->second);
+      add_files.erase(add_it);
+    }
+
+    auto& del_files = level_state.deleted_files;
+    assert(del_files.find(file_number) == del_files.end());
+    del_files.emplace(file_number);
+
+    table_file_levels_[file_number] =
+        VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+    return Status::OK();
+  }
+
+  Status ApplyFileAddition(int level, const FileMetaData& meta) {
+    assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+    const uint64_t file_number = meta.fd.GetNumber();
+
+    const int current_level = GetCurrentLevelForTableFile(file_number);
+
+    if (current_level !=
+        VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+      if (level >= num_levels_) {
+        has_invalid_levels_ = true;
+      }
+
+      std::ostringstream oss;
+      oss << "Cannot add table file #" << file_number << " to level " << level
+          << " since it is already in the LSM tree on level " << current_level;
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    if (level >= num_levels_) {
+      ++invalid_level_sizes_[level];
+      table_file_levels_[file_number] = level;
+
+      return Status::OK();
+    }
+
+    auto& level_state = levels_[level];
+
+    auto& del_files = level_state.deleted_files;
+    auto del_it = del_files.find(file_number);
+    if (del_it != del_files.end()) {
+      del_files.erase(del_it);
+    }
+
+    FileMetaData* const f = new FileMetaData(meta);
+    f->refs = 1;
+
+    if (file_metadata_cache_res_mgr_) {
+      Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+          f->ApproximateMemoryUsage(), true /* increase */);
+      if (!s.ok()) {
+        delete f;
+        s = Status::MemoryLimit(
+            "Can't allocate " +
+            kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                CacheEntryRole::kFileMetadata)] +
+            " due to exceeding the memory limit "
+            "based on "
+            "cache capacity");
+        return s;
+      }
+    }
+ + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + + const uint64_t blob_file_number = f->oldest_blob_file_number; + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->LinkSst(file_number); + } + } + + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + Status ApplyCompactCursors(int level, + const InternalKey& smallest_uncompacted_key) { + if (level < 0) { + std::ostringstream oss; + oss << "Cannot add compact cursor (" << level << "," + << smallest_uncompacted_key.Encode().ToString() + << " due to invalid level (level = " << level << ")"; + return Status::Corruption("VersionBuilder", oss.str()); + } + if (level < num_levels_) { + // Omit levels (>= num_levels_) when re-open with shrinking num_levels_ + updated_compact_cursors_[level] = smallest_uncompacted_key; + } + return Status::OK(); + } + + // Apply all of the edits in *edit to the current state. + Status Apply(const VersionEdit* edit) { + { + const Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } + } + + // Note: we process the blob file related changes first because the + // table file addition/deletion logic depends on the blob files + // already being there. + + // Add new blob files + for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) { + const Status s = ApplyBlobFileAddition(blob_file_addition); + if (!s.ok()) { + return s; + } + } + + // Increase the amount of garbage for blob files affected by GC + for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) { + const Status s = ApplyBlobFileGarbage(blob_file_garbage); + if (!s.ok()) { + return s; + } + } + + // Delete table files + for (const auto& deleted_file : edit->GetDeletedFiles()) { + const int level = deleted_file.first; + const uint64_t file_number = deleted_file.second; + + const Status s = ApplyFileDeletion(level, file_number); + if (!s.ok()) { + return s; + } + } + + // Add new table files + for (const auto& new_file : edit->GetNewFiles()) { + const int level = new_file.first; + const FileMetaData& meta = new_file.second; + + const Status s = ApplyFileAddition(level, meta); + if (!s.ok()) { + return s; + } + } + + // Populate compact cursors for round-robin compaction, leave + // the cursor to be empty to indicate it is invalid + for (const auto& cursor : edit->GetCompactCursors()) { + const int level = cursor.first; + const InternalKey smallest_uncompacted_key = cursor.second; + const Status s = ApplyCompactCursors(level, smallest_uncompacted_key); + if (!s.ok()) { + return s; + } + } + return Status::OK(); + } + + // Helper function template for merging the blob file metadata from the base + // version with the mutable metadata representing the state after applying the + // edits. The function objects process_base and process_mutable are + // respectively called to handle a base version object when there is no + // matching mutable object, and a mutable object when there is no matching + // base version object. process_both is called to perform the merge when a + // given blob file appears both in the base version and the mutable list. The + // helper stops processing objects if a function object returns false. Blob + // files with a file number below first_blob_file are not processed. 
+  template <typename ProcessBase, typename ProcessMutable,
+            typename ProcessBoth>
+  void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
+                          ProcessMutable process_mutable,
+                          ProcessBoth process_both) const {
+    assert(base_vstorage_);
+
+    auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
+    const auto base_it_end = base_vstorage_->GetBlobFiles().end();
+
+    auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
+    const auto mutable_it_end = mutable_blob_file_metas_.end();
+
+    while (base_it != base_it_end && mutable_it != mutable_it_end) {
+      const auto& base_meta = *base_it;
+      assert(base_meta);
+
+      const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
+      const uint64_t mutable_blob_file_number = mutable_it->first;
+
+      if (base_blob_file_number < mutable_blob_file_number) {
+        if (!process_base(base_meta)) {
+          return;
+        }
+
+        ++base_it;
+      } else if (mutable_blob_file_number < base_blob_file_number) {
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_mutable(mutable_meta)) {
+          return;
+        }
+
+        ++mutable_it;
+      } else {
+        assert(base_blob_file_number == mutable_blob_file_number);
+
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_both(base_meta, mutable_meta)) {
+          return;
+        }
+
+        ++base_it;
+        ++mutable_it;
+      }
+    }
+
+    while (base_it != base_it_end) {
+      const auto& base_meta = *base_it;
+
+      if (!process_base(base_meta)) {
+        return;
+      }
+
+      ++base_it;
+    }
+
+    while (mutable_it != mutable_it_end) {
+      const auto& mutable_meta = mutable_it->second;
+
+      if (!process_mutable(mutable_meta)) {
+        return;
+      }
+
+      ++mutable_it;
+    }
+  }
+
+  // Helper function template for finding the first blob file that has linked
+  // SSTs.
+  template <typename Meta>
+  static bool CheckLinkedSsts(const Meta& meta,
+                              uint64_t* min_oldest_blob_file_num) {
+    assert(min_oldest_blob_file_num);
+
+    if (!meta.GetLinkedSsts().empty()) {
+      assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
+
+      *min_oldest_blob_file_num = meta.GetBlobFileNumber();
+
+      return false;
+    }
+
+    return true;
+  }
+
+  // Find the oldest blob file that has linked SSTs.
+  uint64_t GetMinOldestBlobFileNumber() const {
+    uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
+
+    auto process_base =
+        [&min_oldest_blob_file_num](
+            const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
+        };
+
+    auto process_mutable = [&min_oldest_blob_file_num](
+                               const MutableBlobFileMetaData& mutable_meta) {
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    auto process_both = [&min_oldest_blob_file_num](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+#ifndef NDEBUG
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+#else
+      (void)base_meta;
+#endif
+
+      // Look at mutable_meta since it supersedes *base_meta
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
+                       process_both);
+
+    return min_oldest_blob_file_num;
+  }
+
+  static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
+      const MutableBlobFileMetaData& mutable_meta) {
+    return BlobFileMetaData::Create(
+        mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
+        mutable_meta.GetGarbageBlobCount(),
+        mutable_meta.GetGarbageBlobBytes());
+  }
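MergeBlobFileMetas above is an ordered three-way merge over two sorted sequences keyed by blob file number. A minimal standalone sketch of the same control flow, with a vector and a map standing in for the base version's blob files and the mutable overlay (and without the real helper's early exit when a callback returns false):

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Walk two sorted sequences in lockstep, dispatching to one of three
// callbacks depending on which side(s) a key appears on.
template <typename FBase, typename FOverlay, typename FBoth>
void MergeSorted(const std::vector<uint64_t>& base,
                 const std::map<uint64_t, char>& overlay, FBase on_base,
                 FOverlay on_overlay, FBoth on_both) {
  auto b = base.begin();
  auto o = overlay.begin();
  while (b != base.end() && o != overlay.end()) {
    if (*b < o->first) {
      on_base(*b);
      ++b;
    } else if (o->first < *b) {
      on_overlay(o->second);
      ++o;
    } else {
      on_both(*b, o->second);  // key present on both sides
      ++b;
      ++o;
    }
  }
  for (; b != base.end(); ++b) on_base(*b);
  for (; o != overlay.end(); ++o) on_overlay(o->second);
}

int main() {
  const std::vector<uint64_t> base{1, 3, 5};
  const std::map<uint64_t, char> overlay{{3, 'x'}, {7, 'y'}};
  MergeSorted(
      base, overlay,
      [](uint64_t n) { std::cout << "base-only " << n << '\n'; },
      [](char c) { std::cout << "overlay-only " << c << '\n'; },
      [](uint64_t n, char c) { std::cout << "both " << n << ' ' << c << '\n'; });
  // Prints: base-only 1, both 3 x, base-only 5, overlay-only y
}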
+  // Add the blob file specified by meta to *vstorage if it is determined to
+  // contain valid data (blobs).
+  template <typename Meta>
+  static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
+    assert(vstorage);
+    assert(meta);
+
+    if (meta->GetLinkedSsts().empty() &&
+        meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
+      return;
+    }
+
+    vstorage->AddBlobFile(std::forward<Meta>(meta));
+  }
+
+  // Merge the blob file metadata from the base version with the changes
+  // (edits) applied, and save the result into *vstorage.
+  void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    assert(base_vstorage_);
+    vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
+                          mutable_blob_file_metas_.size());
+
+    const uint64_t oldest_blob_file_with_linked_ssts =
+        GetMinOldestBlobFileNumber();
+
+    auto process_base =
+        [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          AddBlobFileIfNeeded(vstorage, base_meta);
+
+          return true;
+        };
+
+    auto process_mutable =
+        [vstorage](const MutableBlobFileMetaData& mutable_meta) {
+          AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+          return true;
+        };
+
+    auto process_both = [vstorage](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+
+      if (!mutable_meta.HasDelta()) {
+        assert(base_meta->GetGarbageBlobCount() ==
+               mutable_meta.GetGarbageBlobCount());
+        assert(base_meta->GetGarbageBlobBytes() ==
+               mutable_meta.GetGarbageBlobBytes());
+        assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
+
+        AddBlobFileIfNeeded(vstorage, base_meta);
+
+        return true;
+      }
+
+      AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+      return true;
+    };
+
+    MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
+                       process_mutable, process_both);
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                    FileMetaData* f) const {
+    const uint64_t file_number = f->fd.GetNumber();
+
+    const auto& level_state = levels_[level];
+
+    const auto& del_files = level_state.deleted_files;
+    const auto del_it = del_files.find(file_number);
+
+    if (del_it != del_files.end()) {
+      // f is to-be-deleted table file
+      vstorage->RemoveCurrentStats(f);
+    } else {
+      const auto& add_files = level_state.added_files;
+      const auto add_it = add_files.find(file_number);
+
+      // Note: if the file appears both in the base version and in the added
+      // list, the added FileMetaData supersedes the one in the base version.
+      if (add_it != add_files.end() && add_it->second != f) {
+        vstorage->RemoveCurrentStats(f);
+      } else {
+        vstorage->AddFile(level, f);
+      }
+    }
+  }
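SaveSSTFilesTo below interleaves the sorted base files of a level with the newly added files in a single merge pass, calling MaybeAddFile on each element in comparator order. A minimal standalone sketch of that pass, with plain ints standing in for FileMetaData pointers and operator< for the level comparator:

#include <algorithm>
#include <iostream>
#include <vector>

// Interleave two individually sorted sequences in one pass, the same
// pattern the templated SaveSSTFilesTo below uses for base vs. added files.
int main() {
  std::vector<int> base{10, 30, 50};
  std::vector<int> added{40, 20};
  std::sort(added.begin(), added.end());  // added files are sorted first

  auto b = base.begin();
  auto a = added.begin();
  while (a != added.end() || b != base.end()) {
    if (b == base.end() || (a != added.end() && *a < *b)) {
      std::cout << "added " << *a++ << '\n';
    } else {
      std::cout << "base " << *b++ << '\n';
    }
  }
  // Output order: base 10, added 20, base 30, added 40, base 50
}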
+  template <typename Cmp>
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level,
+                      Cmp cmp) const {
+    // Merge the set of added files with the set of pre-existing files.
+    // Drop any deleted files. Store the result in *vstorage.
+    const auto& base_files = base_vstorage_->LevelFiles(level);
+    const auto& unordered_added_files = levels_[level].added_files;
+    vstorage->Reserve(level,
+                      base_files.size() + unordered_added_files.size());
+
+    // Sort added files for the level.
+    std::vector<FileMetaData*> added_files;
+    added_files.reserve(unordered_added_files.size());
+    for (const auto& pair : unordered_added_files) {
+      added_files.push_back(pair.second);
+    }
+    std::sort(added_files.begin(), added_files.end(), cmp);
+
+    auto base_iter = base_files.begin();
+    auto base_end = base_files.end();
+    auto added_iter = added_files.begin();
+    auto added_end = added_files.end();
+    while (added_iter != added_end || base_iter != base_end) {
+      if (base_iter == base_end ||
+          (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+        MaybeAddFile(vstorage, level, *added_iter++);
+      } else {
+        MaybeAddFile(vstorage, level, *base_iter++);
+      }
+    }
+  }
+
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    if (!num_levels_) {
+      return;
+    }
+
+    SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
+
+    for (int level = 1; level < num_levels_; ++level) {
+      SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
+    }
+  }
+
+  void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
+    for (auto iter = updated_compact_cursors_.begin();
+         iter != updated_compact_cursors_.end(); iter++) {
+      vstorage->AddCursorForOneLevel(iter->first, iter->second);
+    }
+  }
+
+  // Save the current state in *vstorage.
+  Status SaveTo(VersionStorageInfo* vstorage) const {
+    Status s;
+
+#ifndef NDEBUG
+    // The same check is done within Apply() so we skip it in release mode.
+    s = CheckConsistency(base_vstorage_);
+    if (!s.ok()) {
+      return s;
+    }
+#endif  // NDEBUG
+
+    s = CheckConsistency(vstorage);
+    if (!s.ok()) {
+      return s;
+    }
+
+    SaveSSTFilesTo(vstorage);
+
+    SaveBlobFilesTo(vstorage);
+
+    SaveCompactCursorsTo(vstorage);
+
+    s = CheckConsistency(vstorage);
+    return s;
+  }
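LoadTableHandlers below fans the files to open out across max_threads workers by having every worker claim the next index from a shared atomic counter. A standalone sketch of that claim-by-fetch_add scheme, with trivial stand-in work items and std::thread in place of port::Thread:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  std::vector<int> work(100, 0);
  std::atomic<size_t> next(0);

  // Each worker repeatedly claims the next unprocessed index; fetch_add
  // guarantees every index is handed out exactly once.
  auto worker = [&]() {
    while (true) {
      const size_t idx = next.fetch_add(1);
      if (idx >= work.size()) break;
      work[idx] = 1;  // "open" item idx
    }
  };

  std::vector<std::thread> threads;
  for (int i = 1; i < 4; ++i) threads.emplace_back(worker);
  worker();  // the calling thread participates too, as in LoadTableHandlers
  for (auto& t : threads) t.join();

  int done = 0;
  for (int v : work) done += v;
  std::printf("processed %d items\n", done);  // prints 100
}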
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin) {
+    assert(table_cache_ != nullptr);
+
+    size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+    bool always_load =
+        (table_cache_capacity == TableCache::kInfiniteCapacity);
+    size_t max_load = std::numeric_limits<size_t>::max();
+
+    if (!always_load) {
+      // If it is initial loading and not set to always loading all the
+      // files, we only load up to kInitialLoadLimit files, to limit the
+      // time reopening the DB.
+      const size_t kInitialLoadLimit = 16;
+      size_t load_limit;
+      // If the table cache is not 1/4 full, we pin the table handle to
+      // file metadata to avoid the cache read costs when reading the file.
+      // The downside of pinning those files is that LRU won't be followed
+      // for those files. This doesn't matter much because if number of files
+      // of the DB exceeds table cache capacity, eventually no table reader
+      // will be pinned and LRU will be followed.
+      if (is_initial_load) {
+        load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+      } else {
+        load_limit = table_cache_capacity / 4;
+      }
+
+      size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+      if (table_cache_usage >= load_limit) {
+        // TODO (yanqin) find a suitable status code.
+        return Status::OK();
+      } else {
+        max_load = load_limit - table_cache_usage;
+      }
+    }
+
+    // <file metadata, level>
+    std::vector<std::pair<FileMetaData*, int>> files_meta;
+    std::vector<Status> statuses;
+    for (int level = 0; level < num_levels_; level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        // If the file has been opened before, just skip it.
+        if (!file_meta->table_reader_handle) {
+          files_meta.emplace_back(file_meta, level);
+          statuses.emplace_back(Status::OK());
+        }
+        if (files_meta.size() >= max_load) {
+          break;
+        }
+      }
+      if (files_meta.size() >= max_load) {
+        break;
+      }
+    }
+
+    std::atomic<size_t> next_file_meta_idx(0);
+    std::function<void()> load_handlers_func([&]() {
+      while (true) {
+        size_t file_idx = next_file_meta_idx.fetch_add(1);
+        if (file_idx >= files_meta.size()) {
+          break;
+        }
+
+        auto* file_meta = files_meta[file_idx].first;
+        int level = files_meta[file_idx].second;
+        statuses[file_idx] = table_cache_->FindTable(
+            ReadOptions(), file_options_,
+            *(base_vstorage_->InternalComparator()), *file_meta,
+            &file_meta->table_reader_handle, prefix_extractor,
+            false /* no_io */, true /* record_read_stats */,
+            internal_stats->GetFileReadHist(level), false, level,
+            prefetch_index_and_filter_in_cache,
+            max_file_size_for_l0_meta_pin, file_meta->temperature);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->fd.table_reader =
+              table_cache_->GetTableReaderFromHandle(
+                  file_meta->table_reader_handle);
+        }
+      }
+    });
+
+    std::vector<port::Thread> threads;
+    for (int i = 1; i < max_threads; i++) {
+      threads.emplace_back(load_handlers_func);
+    }
+    load_handlers_func();
+    for (auto& t : threads) {
+      t.join();
+    }
+    Status ret;
+    for (const auto& s : statuses) {
+      if (!s.ok()) {
+        if (ret.ok()) {
+          ret = s;
+        }
+      }
+    }
+    return ret;
+  }
+};
+
+VersionBuilder::VersionBuilder(
+    const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+    TableCache* table_cache, VersionStorageInfo* base_vstorage,
+    VersionSet* version_set,
+    std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+    : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
+                   version_set, file_metadata_cache_res_mgr)) {}
+
+VersionBuilder::~VersionBuilder() = default;
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+  return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(const VersionEdit* edit) {
+  return rep_->Apply(edit);
+}
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
+  return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+    InternalStats* internal_stats, int max_threads,
+    bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    size_t max_file_size_for_l0_meta_pin) {
+  return rep_->LoadTableHandlers(
+      internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+      is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
+}
+
+uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
+  return rep_->GetMinOldestBlobFileNumber();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), cfd->current()->storage_info(),
+          cfd->current()->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(cfd->current()) {
+  version_->Ref();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd, Version* v)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), v->storage_info(), v->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(v) {
+  assert(version_ != cfd->current());
+}
+
+BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
+  version_->Unref();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
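The intended call sequence for the class defined above, construct against a base version, Apply() one or more edits, then SaveTo() a fresh VersionStorageInfo, is exercised by the unit tests later in this patch. A condensed sketch of that flow (it assumes the test fixture's icmp_, ucmp_, options_, ioptions_, and base vstorage_ members shown below):

// Condensed from the VersionBuilderTest cases later in this patch.
VersionEdit edit;
edit.DeleteFile(/* level */ 3, /* file number */ 27);

VersionBuilder builder(EnvOptions(), &ioptions_, /* table_cache */ nullptr,
                       &vstorage_, /* version_set */ nullptr);

VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
                                kCompactionStyleLevel, nullptr,
                                /* force_consistency_checks */ false);
Status s = builder.Apply(&edit);  // stage the edit against the base version
if (s.ok()) {
  s = builder.SaveTo(&new_vstorage);  // materialize the resulting state
}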
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..1c022832a
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableCFOptions;
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+class Version;
+class VersionSet;
+class ColumnFamilyData;
+class CacheReservationManager;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const FileOptions& file_options,
+                 const ImmutableCFOptions* ioptions, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage, VersionSet* version_set,
+                 std::shared_ptr<CacheReservationManager>
+                     file_metadata_cache_res_mgr = nullptr);
+  ~VersionBuilder();
+
+  bool CheckConsistencyForNumLevels();
+  Status Apply(const VersionEdit* edit);
+  Status SaveTo(VersionStorageInfo* vstorage) const;
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin);
+  uint64_t GetMinOldestBlobFileNumber() const;
+
+ private:
+  class Rep;
+  std::unique_ptr<Rep> rep_;
+};
+
+// A wrapper of version builder which references the current version in the
+// constructor and unrefs it in the destructor.
+// Both the constructor and the destructor need to be called inside DB Mutex.
+class BaseReferencedVersionBuilder {
+ public:
+  explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
+  BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
+  ~BaseReferencedVersionBuilder();
+  VersionBuilder* version_builder() const { return version_builder_.get(); }
+
+ private:
+  std::unique_ptr<VersionBuilder> version_builder_;
+  Version* version_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..ee5c3f2e3
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,1695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
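BaseReferencedVersionBuilder, declared in the header above, is a small RAII guard: it Ref()s the base version in its constructor and Unref()s it in its destructor, keeping the version pinned for as long as the builder is in use. A standalone sketch of the same pinning idiom, with a toy Version type rather than RocksDB's:

#include <cassert>

// Toy stand-in for Version: a manually reference-counted object.
struct Version {
  int refs = 0;
  void Ref() { ++refs; }
  void Unref() { assert(refs > 0); --refs; }
};

// RAII guard mirroring BaseReferencedVersionBuilder: pin on construction,
// unpin on destruction, so the pinned object outlives the guard's users.
class VersionGuard {
 public:
  explicit VersionGuard(Version* v) : v_(v) { v_->Ref(); }
  ~VersionGuard() { v_->Unref(); }
  VersionGuard(const VersionGuard&) = delete;
  VersionGuard& operator=(const VersionGuard&) = delete;

 private:
  Version* v_;
};

int main() {
  Version v;
  {
    VersionGuard guard(&v);
    assert(v.refs == 1);  // pinned while the guard is alive
  }
  assert(v.refs == 0);  // released when the guard goes out of scope
}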
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::vector<uint64_t> size_being_compacted_;
+
+  VersionBuilderTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+                  nullptr, false),
+        file_num_(1) {
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    size_being_compacted_.resize(options_.num_levels);
+  }
+
+  ~VersionBuilderTest() override {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  void Add(int level, uint64_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100, uint64_t num_entries = 0,
+           uint64_t num_deletions = 0, bool sampled = false,
+           SequenceNumber smallest_seqno = 0,
+           SequenceNumber largest_seqno = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size,
+        GetInternalKey(smallest, smallest_seq),
+        GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+        /* marked_for_compact */ false, Temperature::kUnknown,
+        oldest_blob_file_number, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size = file_size;
+    f->num_entries = num_entries;
+    f->num_deletions = num_deletions;
+    vstorage_.AddFile(level, f);
+    if (sampled) {
+      f->init_stats_from_file = true;
+      vstorage_.UpdateAccumulatedStats(f);
+    }
+  }
+
+  void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+               uint64_t total_blob_bytes, std::string checksum_method,
+               std::string checksum_value,
+               BlobFileMetaData::LinkedSsts linked_ssts,
+               uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value));
+    auto meta = BlobFileMetaData::Create(
+        std::move(shared_meta), std::move(linked_ssts), garbage_blob_count,
+        garbage_blob_bytes);
+
+    vstorage_.AddBlobFile(std::move(meta));
+  }
+
+  void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) {
+    constexpr int level = 0;
+    constexpr char smallest[] = "bar";
+    constexpr char largest[] = "foo";
+    constexpr uint64_t file_size = 100;
+    constexpr uint32_t path_id = 0;
+    constexpr SequenceNumber smallest_seq = 0;
+    constexpr SequenceNumber largest_seq = 0;
+    constexpr uint64_t num_entries = 0;
+    constexpr uint64_t num_deletions = 0;
+    constexpr bool sampled = false;
+
+    Add(level, table_file_number, smallest,
largest, file_size, path_id, + smallest_seq, largest_seq, num_entries, num_deletions, sampled, + smallest_seq, largest_seq, blob_file_number); + } + + void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number, + uint64_t blob_file_number) { + assert(edit); + + constexpr int level = 0; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 100; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 300; + constexpr bool marked_for_compaction = false; + + edit->AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + } + + void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { + assert(vstorage); + + vstorage->PrepareForVersionAppend(ioptions_, mutable_cf_options_); + vstorage->SetFinalized(); + } + + void UpdateVersionStorageInfo() { UpdateVersionStorageInfo(&vstorage_); } +}; + +void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) { + for (int i = 0; i < new_vstorage->num_levels(); i++) { + for (auto* f : new_vstorage->LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + +TEST_F(VersionBuilderTest, ApplyAndSaveTo) { + Add(0, 1U, "150", "200", 100U); + + Add(1, 66U, "150", "200", 100U); + Add(1, 88U, "201", "300", 100U); + + Add(2, 6U, "150", "179", 100U); + Add(2, 7U, "180", "220", 100U); + Add(2, 8U, "221", "300", 100U); + + Add(3, 26U, "150", "170", 100U); + Add(3, 27U, "171", "179", 100U); + Add(3, 28U, "191", "220", 100U); + Add(3, 29U, "221", "300", 100U); + + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile( + 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.DeleteFile(3, 27U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, false); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { + ioptions_.level_compaction_dynamic_level_bytes = true; + + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + + Add(4, 6U, "150", "179", 100U); + Add(4, 7U, "180", "220", 100U); + Add(4, 8U, "221", "300", 100U); + + Add(5, 26U, "150", "170", 100U); + Add(5, 27U, "171", "179", 100U); + + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile( + 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, 
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.DeleteFile(0, 1U); + version_edit.DeleteFile(0, 88U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, false); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); + ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3)); + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4)); + ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { + ioptions_.level_compaction_dynamic_level_bytes = true; + + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + + Add(4, 6U, "150", "179", 100U); + Add(4, 7U, "180", "220", 100U); + Add(4, 8U, "221", "300", 100U); + + Add(5, 26U, "150", "170", 100U); + Add(5, 27U, "171", "179", 100U); + + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile( + 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.DeleteFile(0, 1U); + version_edit.DeleteFile(0, 88U); + version_edit.DeleteFile(4, 6U); + version_edit.DeleteFile(4, 7U); + version_edit.DeleteFile(4, 8U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, false); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); + ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4)); + ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile( + 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 616, 0, 100U, GetInternalKey("501"), 
GetInternalKey("550"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, false); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, false); + + VersionEdit version_edit; + version_edit.AddFile( + 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit.AddFile( + 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + ASSERT_OK(version_builder.Apply(&version_edit)); + + VersionEdit version_edit2; + version_edit.AddFile( + 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + version_edit2.DeleteFile(2, 616); + version_edit2.DeleteFile(2, 636); + version_edit.AddFile( + 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, + false, Temperature::kUnknown, kInvalidBlobFileNumber, 
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + ASSERT_OK(version_builder.Apply(&version_edit2)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 100; + + Add(level, file_number, smallest, largest, file_size); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int incorrect_level = 3; + + edit.DeleteFile(incorrect_level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #2345 from level 3 since " + "it is on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 1234; + + edit.DeleteFile(level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #1234 from level 3 since " + "it is not in the LSM tree")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 10000; + constexpr uint32_t path_id = 0; + constexpr SequenceNumber smallest_seq = 100; + constexpr SequenceNumber largest_seq = 500; + constexpr uint64_t num_entries = 0; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = false; + constexpr SequenceNumber smallest_seqno = 1; + constexpr SequenceNumber largest_seqno = 1000; + + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, num_entries, num_deletions, sampled, smallest_seqno, + largest_seqno); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + VersionEdit addition; + + constexpr bool marked_for_compaction = false; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr bool force_consistency_checks = false; + 
VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 10000; + + Add(level, file_number, smallest, largest, file_size); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int new_level = 2; + constexpr uint32_t path_id = 0; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile( + new_level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + ASSERT_OK(builder.Apply(&edit)); + + VersionEdit other_edit; + + constexpr int new_level = 2; + + other_edit.AddFile( + new_level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + const Status s = builder.Apply(&other_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 3")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { + UpdateVersionStorageInfo(); + + constexpr int level = 1; + constexpr 
uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + addition.AddFile( + level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + ASSERT_OK(builder.Apply(&addition)); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid()); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&edit, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) { + // Attempt to add a blob file that is already present in the base version. + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count, + garbage_blob_bytes); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) { + // Attempt to add the same blob file twice using version edits. 
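// [Editorial note -- not part of the upstream patch] Note the error-message
// asymmetry between the two file kinds: duplicate table-file additions spell
// out both levels ("Cannot add table file #2345 to level 2 since it is
// already in the LSM tree on level 1"), whereas duplicate blob additions
// yield the same "Blob file #1234 already added" regardless of whether the
// first copy came from the base version (previous test) or from an earlier
// edit (this test).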
+ + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { + // Increase the amount of garbage for a blob file present in the base version. + + constexpr uint64_t table_file_number = 1; + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts{table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + const auto meta = vstorage_.GetBlobFileMetaData(blob_file_number); + ASSERT_NE(meta, nullptr); + + // Add dummy table file to ensure the blob file is referenced. 
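// [Editorial note -- not part of the upstream patch] Garbage recorded via a
// VersionEdit is cumulative: saving adds the edit's numbers to whatever the
// base version already tracks for the file. With the constants above, the
// assertions further down therefore expect
//
//   garbage blobs : 123    + 456    = 579
//   garbage bytes : 456789 + 111111 = 567900
//
// while the totals (5678 blobs / 999999 bytes) stay unchanged.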
+ AddDummyFile(table_file_number, blob_file_number); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t new_garbage_blob_count = 456; + constexpr uint64_t new_garbage_blob_bytes = 111111; + + edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count, + new_garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta()); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), + garbage_blob_count + new_garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), + garbage_blob_bytes + new_garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { + // Increase the amount of garbage for a blob file added using a version edit. + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&garbage)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) { + // Attempt to increase the amount of garbage for a blob file that is + // neither in the base version, nor was it added using a version edit. + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 999999; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found")); +} + +TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) { + // Test that VersionEdits that would result in the count/total size of garbage + // exceeding the count/total size of all blobs are rejected. + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
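// [Editorial note -- not part of the upstream patch] The invariant enforced
// here is garbage <= total, checked independently for the blob count and the
// byte count. With total_blob_count = 5678 and total_blob_bytes = 999999,
// the two edits below each overshoot exactly one bound:
//
//   garbage 5679 blobs     > 5678 total    -> Corruption
//   garbage 1000000 bytes  > 999999 total  -> Corruption
//
// Garbage exactly equal to the totals is legal, but it marks the file as
// consisting entirely of garbage (see the AllGarbage tests further down).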
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + { + // Garbage blob count overflow + constexpr uint64_t garbage_blob_count = 5679; + constexpr uint64_t garbage_blob_bytes = 999999; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } + + { + // Garbage blob bytes overflow + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 1000000; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } +} + +TEST_F(VersionBuilderTest, SaveBlobFilesTo) { + // Add three blob files to base version. + for (uint64_t i = 1; i <= 3; ++i) { + const uint64_t table_file_number = 2 * i; + const uint64_t blob_file_number = 2 * i + 1; + const uint64_t total_blob_count = i * 1000; + const uint64_t total_blob_bytes = i * 1000000; + const uint64_t garbage_blob_count = i * 100; + const uint64_t garbage_blob_bytes = i * 20000; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count, + garbage_blob_bytes); + } + + // Add dummy table files to ensure the blob files are referenced. + // Note: files are added to L0, so they have to be added in reverse order + // (newest first). + for (uint64_t i = 3; i >= 1; --i) { + const uint64_t table_file_number = 2 * i; + const uint64_t blob_file_number = 2 * i + 1; + + AddDummyFile(table_file_number, blob_file_number); + } + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add some garbage to the second and third blob files. The second blob file + // remains valid since it does not consist entirely of garbage yet. The third + // blob file is all garbage after the edit and will not be part of the new + // version. The corresponding dummy table file is also removed for + // consistency. + edit.AddBlobFileGarbage(/* blob_file_number */ 5, + /* garbage_blob_count */ 200, + /* garbage_blob_bytes */ 100000); + edit.AddBlobFileGarbage(/* blob_file_number */ 7, + /* garbage_blob_count */ 2700, + /* garbage_blob_bytes */ 2940000); + edit.DeleteFile(/* level */ 0, /* file_number */ 6); + + // Add a fourth blob file. 
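// [Editorial note -- not part of the upstream patch] Working through the
// setup above: blob file 2*i+1 holds i*1000 blobs / i*1000000 bytes with
// i*100 garbage blobs / i*20000 garbage bytes, so the edit leaves
//
//   blob #5: 200 + 200  = 400  garbage blobs,  40000 + 100000  = 140000 B
//            -> partially valid, kept in the new version
//   blob #7: 300 + 2700 = 3000 garbage blobs,  60000 + 2940000 = 3000000 B
//            -> equal to its totals, i.e. all garbage, dropped on SaveTo
//
// which, together with new blob #9, is why the assertions below expect the
// blob file set {3, 5, 9}.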
+ edit.AddBlobFile(/* blob_file_number */ 9, /* total_blob_count */ 4000, + /* total_blob_bytes */ 4000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 3); + + const auto meta3 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3); + + ASSERT_NE(meta3, nullptr); + ASSERT_EQ(meta3->GetBlobFileNumber(), 3); + ASSERT_EQ(meta3->GetTotalBlobCount(), 1000); + ASSERT_EQ(meta3->GetTotalBlobBytes(), 1000000); + ASSERT_EQ(meta3->GetGarbageBlobCount(), 100); + ASSERT_EQ(meta3->GetGarbageBlobBytes(), 20000); + + const auto meta5 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 5); + + ASSERT_NE(meta5, nullptr); + ASSERT_EQ(meta5->GetBlobFileNumber(), 5); + ASSERT_EQ(meta5->GetTotalBlobCount(), 2000); + ASSERT_EQ(meta5->GetTotalBlobBytes(), 2000000); + ASSERT_EQ(meta5->GetGarbageBlobCount(), 400); + ASSERT_EQ(meta5->GetGarbageBlobBytes(), 140000); + + const auto meta9 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 9); + + ASSERT_NE(meta9, nullptr); + ASSERT_EQ(meta9->GetBlobFileNumber(), 9); + ASSERT_EQ(meta9->GetTotalBlobCount(), 4000); + ASSERT_EQ(meta9->GetTotalBlobBytes(), 4000000); + ASSERT_EQ(meta9->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta9->GetGarbageBlobBytes(), 0); + + // Delete the first table file, which makes the first blob file obsolete + // since it's at the head and unreferenced. + VersionBuilder second_builder(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + + VersionEdit second_edit; + second_edit.DeleteFile(/* level */ 0, /* file_number */ 2); + + ASSERT_OK(second_builder.Apply(&second_edit)); + + VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &new_vstorage, + force_consistency_checks); + + ASSERT_OK(second_builder.SaveTo(&newer_vstorage)); + + UpdateVersionStorageInfo(&newer_vstorage); + + const auto& newer_blob_files = newer_vstorage.GetBlobFiles(); + ASSERT_EQ(newer_blob_files.size(), 2); + + const auto newer_meta3 = + newer_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3); + + ASSERT_EQ(newer_meta3, nullptr); + + UnrefFilesInVersion(&newer_vstorage); + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { + // When multiple background jobs (flushes/compactions) are executing in + // parallel, it is possible for the VersionEdit adding blob file K to be + // applied *after* the VersionEdit adding blob file N (for N > K). This test + // case makes sure this is handled correctly. + + // Add blob file #4 (referenced by table file #3) to base version. 
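// [Editorial note -- not part of the upstream patch] In other words, blob
// file numbers are assigned when a job begins, but the resulting edits may
// commit in a different order, so the builder can see "add blob #2" while
// blob #4 already sits in the base version. The expected outcome checked at
// the end of this test is that both files survive:
//
//   base version : { blob #4 }
//   edit         : + blob #2 (referenced by new table file #1)
//   new version  : { blob #2, blob #4 }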
+ constexpr uint64_t base_table_file_number = 3; + constexpr uint64_t base_blob_file_number = 4; + constexpr uint64_t base_total_blob_count = 100; + constexpr uint64_t base_total_blob_bytes = 1 << 20; + + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = "\xfa\xce\xb0\x0c"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + AddDummyFile(base_table_file_number, base_blob_file_number); + AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes, + checksum_method, checksum_value, + BlobFileMetaData::LinkedSsts{base_table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add blob file #2 (referenced by table file #1). + constexpr int level = 0; + constexpr uint64_t table_file_number = 1; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 1 << 12; + constexpr char smallest[] = "key1"; + constexpr char largest[] = "key987"; + constexpr SequenceNumber smallest_seqno = 0; + constexpr SequenceNumber largest_seqno = 0; + constexpr bool marked_for_compaction = false; + + constexpr uint64_t blob_file_number = 2; + static_assert(blob_file_number < base_blob_file_number, + "Added blob file should have a smaller file number"); + + constexpr uint64_t total_blob_count = 234; + constexpr uint64_t total_blob_bytes = 1 << 22; + + edit.AddFile(level, table_file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + checksum_value, checksum_method, kNullUniqueId64x2); + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 2); + + const auto base_meta = + new_vstorage.GetBlobFileMetaData(base_blob_file_number); + + ASSERT_NE(base_meta, nullptr); + ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number); + ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count); + ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes); + ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value); + + const auto added_meta = new_vstorage.GetBlobFileMetaData(blob_file_number); + + ASSERT_NE(added_meta, nullptr); + ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(added_meta->GetChecksumMethod(), 
checksum_method); + ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { + // Initialize base version. The first table file points to a valid blob file + // in this version; the second one does not refer to any blob files. + + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201", + /* largest */ "300", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200, + kInvalidBlobFileNumber); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + // Add a new table file that points to the existing blob file, and add a + // new table file--blob file pair. + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("701"), + /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("801"), + /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, + /* total_blob_bytes */ 200000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) { + // Initialize base version. Links between the table file and the blob file + // are inconsistent. 
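// [Editorial note -- not part of the upstream patch] The mismatch built
// below is two-sided: the table file claims oldest_blob_file_number = 256,
// while blob #16's metadata claims LinkedSsts{1}. On save, the builder
// effectively recomputes the links from the table files and compares them
// against the stored metadata:
//
//   derived from SSTs : blob #256 <- {1}, blob #16 <- {}
//   stored metadata   :                   blob #16 <- {1}
//
// hence the "Links are inconsistent ... blob file #16" corruption below.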
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 256); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr( + s.getState(), + "Links are inconsistent between table files and blob file #16")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) { + // Initialize base version. The table file points to a blob file that is + // all garbage. + + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) { + // Initialize base version, with a table file pointing to a blob file + // that has no garbage at this point. 
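// [Editorial note -- not part of the upstream patch] This variant reaches
// the same error as AllGarbage above by a different route: the base version
// starts clean (0 garbage), and a VersionEdit then raises the garbage
// counters to exactly the totals (1000 blobs / 1000000 bytes) while table
// file #1 still links to blob #16. An all-garbage blob file must have no
// linked SSTs, so SaveTo again reports Corruption.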
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0); + + UpdateVersionStorageInfo(); + + // Mark the entire blob file garbage but do not remove the linked SST. + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFileGarbage(/* blob_file_number */ 16, + /* garbage_blob_count */ 1000, + /* garbage_blob_bytes */ 1000000); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { + // Initialize base version. Table files 1..10 are linked to blob files 1..5, + // while table files 11..20 are not linked to any blob files. 
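// [Editorial note -- not part of the upstream patch] The linkage produced by
// the loops below follows oldest_blob_file_number = ((i - 1) % 5) + 1, so
// each blob file k in 1..5 is shared by exactly two table files:
//
//   blob #k <- { table #k, table #(k + 5) }     e.g. blob #3 <- {3, 8}
//
// which is precisely the expected_linked_ssts vector asserted right after
// UpdateVersionStorageInfo().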
+ + for (uint64_t i = 1; i <= 10; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, + /* oldest_blob_file_number */ ((i - 1) % 5) + 1); + } + + for (uint64_t i = 1; i <= 5; ++i) { + AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000, + /* total_blob_bytes */ 2000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{i, i + 5}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + } + + for (uint64_t i = 11; i <= 20; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, kInvalidBlobFileNumber); + } + + UpdateVersionStorageInfo(); + + { + const auto& blob_files = vstorage_.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + vstorage_.GetBlobFileMetaData(/* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + } + + VersionEdit edit; + + // Add an SST that references a blob file. + edit.AddFile( + /* level */ 1, /* file_number */ 21, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100), + /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100, + /* largest_seqno */ 2100, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + // Add an SST that does not reference any blob files. + edit.AddFile( + /* level */ 1, /* file_number */ 22, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200), + /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200, + /* largest_seqno */ 2200, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + // Delete a file that references a blob file. + edit.DeleteFile(/* level */ 1, /* file_number */ 6); + + // Delete a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 16); + + // Trivially move a file that references a blob file. Note that we save + // the original BlobFileMetaData object so we can check that no new object + // gets created. 
+ auto meta3 = vstorage_.GetBlobFileMetaData(/* blob_file_number */ 3); + + edit.DeleteFile(/* level */ 1, /* file_number */ 3); + edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("03", 300), + /* largest */ GetInternalKey("03", 300), + /* smallest_seqno */ 300, + /* largest_seqno */ 300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + // Trivially move a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 13); + edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300), + /* largest */ GetInternalKey("13", 1300), + /* smallest_seqno */ 1300, + /* largest_seqno */ 1300, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + + // Add one more SST file that references a blob file, then promptly + // delete it in a second version edit before the new version gets saved. + // This file should not show up as linked to the blob file in the new version. + edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300), + /* largest */ GetInternalKey("23", 2300), + /* smallest_seqno */ 2300, + /* largest_seqno */ 2300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + VersionEdit edit2; + + edit2.DeleteFile(/* level */ 1, /* file_number */ 23); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + ASSERT_OK(builder.Apply(&edit)); + ASSERT_OK(builder.Apply(&edit2)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + { + const auto& blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + new_vstorage.GetBlobFileMetaData(/* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + + // Make sure that no new BlobFileMetaData got created for the blob file + // affected by the trivial move. 
+ ASSERT_EQ(new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3), + meta3); + } + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { + Add(0, 1U, "150", "200", 100U); + + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.DeleteFile(0, 1U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + UpdateVersionStorageInfo(&new_vstorage); + + VersionBuilder version_builder2(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_NOK(version_builder2.Apply(&version_edit)); + + UnrefFilesInVersion(&new_vstorage); + UnrefFilesInVersion(&new_vstorage2); +} + +TEST_F(VersionBuilderTest, EstimatedActiveKeys) { + const uint32_t kTotalSamples = 20; + const uint32_t kNumLevels = 5; + const uint32_t kFilesPerLevel = 8; + const uint32_t kNumFiles = kNumLevels * kFilesPerLevel; + const uint32_t kEntriesPerFile = 1000; + const uint32_t kDeletionsPerFile = 100; + for (uint32_t i = 0; i < kNumFiles; ++i) { + Add(static_cast(i / kFilesPerLevel), i + 1, + std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100, + kEntriesPerFile, kDeletionsPerFile, (i < kTotalSamples)); + } + // minus 2X for the number of deletion entries because: + // 1x for deletion entry does not count as a data entry. + // 1x for each deletion entry will actually remove one data entry. + ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(), + (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc new file mode 100644 index 000000000..e4e02fe25 --- /dev/null +++ b/src/rocksdb/db/version_edit.cc @@ -0,0 +1,1043 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_edit.h" + +#include "db/blob/blob_index.h" +#include "db/version_set.h" +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace {} // anonymous namespace + +uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { + assert(number <= kFileNumberMask); + return number | (path_id * (kFileNumberMask + 1)); +} + +Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, + ValueType value_type) { + if (value_type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + + if (!blob_index.IsInlined() && !blob_index.HasTTL()) { + if (blob_index.file_number() == kInvalidBlobFileNumber) { + return Status::Corruption("Invalid blob file number"); + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } + } + + if (smallest.size() == 0) { + smallest.DecodeFrom(key); + } + largest.DecodeFrom(key); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + + return Status::OK(); +} + +void VersionEdit::Clear() { + max_level_ = 0; + db_id_.clear(); + comparator_.clear(); + log_number_ = 0; + prev_log_number_ = 0; + next_file_number_ = 0; + max_column_family_ = 0; + min_log_number_to_keep_ = 0; + last_sequence_ = 0; + has_db_id_ = false; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_max_column_family_ = false; + has_min_log_number_to_keep_ = false; + has_last_sequence_ = false; + compact_cursors_.clear(); + deleted_files_.clear(); + new_files_.clear(); + blob_file_additions_.clear(); + blob_file_garbages_.clear(); + wal_additions_.clear(); + wal_deletion_.Reset(); + column_family_ = 0; + is_column_family_add_ = false; + is_column_family_drop_ = false; + column_family_name_.clear(); + is_in_atomic_group_ = false; + remaining_entries_ = 0; + full_history_ts_low_.clear(); +} + +bool VersionEdit::EncodeTo(std::string* dst) const { + if (has_db_id_) { + PutVarint32(dst, kDbId); + PutLengthPrefixedSlice(dst, db_id_); + } + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32Varint64(dst, kLogNumber, log_number_); + } + if (has_prev_log_number_) { + PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); + } + if (has_max_column_family_) { + PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_); + } + if (has_min_log_number_to_keep_) { + PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_); + } + if (has_last_sequence_) { + PutVarint32Varint64(dst, kLastSequence, last_sequence_); + } + for (size_t i = 0; i < compact_cursors_.size(); i++) { + if (compact_cursors_[i].second.Valid()) { + PutVarint32(dst, kCompactCursor); + PutVarint32(dst, compact_cursors_[i].first); // level + PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode()); + } + } + for (const auto& deleted : deleted_files_) { + PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */, + deleted.second /* file number */); + } + + bool 
min_log_num_written = false; + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + if (!f.smallest.Valid() || !f.largest.Valid()) { + return false; + } + PutVarint32(dst, kNewFile4); + PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); + PutVarint64(dst, f.fd.GetFileSize()); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); + // Customized fields' format: + // +-----------------------------+ + // | 1st field's tag (varint32) | + // +-----------------------------+ + // | 1st field's size (varint32) | + // +-----------------------------+ + // | bytes for 1st field | + // | (based on size decoded) | + // +-----------------------------+ + // | | + // | ...... | + // | | + // +-----------------------------+ + // | last field's size (varint32)| + // +-----------------------------+ + // | bytes for last field | + // | (based on size decoded) | + // +-----------------------------+ + // | terminating tag (varint32) | + // +-----------------------------+ + // + // Customized encoding for fields: + // tag kPathId: 1 byte as path_id + // tag kNeedCompaction: + // now only can take one char value 1 indicating need-compaction + // + PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime); + std::string varint_oldest_ancester_time; + PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", + &varint_oldest_ancester_time); + PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); + + PutVarint32(dst, NewFileCustomTag::kFileCreationTime); + std::string varint_file_creation_time; + PutVarint64(&varint_file_creation_time, f.file_creation_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", + &varint_file_creation_time); + PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + + PutVarint32(dst, NewFileCustomTag::kFileChecksum); + PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); + + PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); + PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); + + if (f.fd.GetPathId() != 0) { + PutVarint32(dst, NewFileCustomTag::kPathId); + char p = static_cast(f.fd.GetPathId()); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.marked_for_compaction) { + PutVarint32(dst, NewFileCustomTag::kNeedCompaction); + char p = static_cast(1); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (has_min_log_number_to_keep_ && !min_log_num_written) { + PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack); + std::string varint_log_number; + PutFixed64(&varint_log_number, min_log_number_to_keep_); + PutLengthPrefixedSlice(dst, Slice(varint_log_number)); + min_log_num_written = true; + } + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber); + std::string oldest_blob_file_number; + PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); + PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); + } + UniqueId64x2 unique_id = f.unique_id; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id); + if (unique_id != kNullUniqueId64x2) { + PutVarint32(dst, 
NewFileCustomTag::kUniqueId); + std::string unique_id_str = EncodeUniqueIdBytes(&unique_id); + PutLengthPrefixedSlice(dst, Slice(unique_id_str)); + } + + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, NewFileCustomTag::kTerminate); + } + + for (const auto& blob_file_addition : blob_file_additions_) { + PutVarint32(dst, kBlobFileAddition); + blob_file_addition.EncodeTo(dst); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + PutVarint32(dst, kBlobFileGarbage); + blob_file_garbage.EncodeTo(dst); + } + + for (const auto& wal_addition : wal_additions_) { + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + // 0 is default and does not need to be explicitly written + if (column_family_ != 0) { + PutVarint32Varint32(dst, kColumnFamily, column_family_); + } + + if (is_column_family_add_) { + PutVarint32(dst, kColumnFamilyAdd); + PutLengthPrefixedSlice(dst, Slice(column_family_name_)); + } + + if (is_column_family_drop_) { + PutVarint32(dst, kColumnFamilyDrop); + } + + if (is_in_atomic_group_) { + PutVarint32(dst, kInAtomicGroup); + PutVarint32(dst, remaining_entries_); + } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } + return true; +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return dst->Valid(); + } else { + return false; + } +} + +bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { + uint32_t v = 0; + if (GetVarint32(input, &v)) { + *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } + return true; + } else { + return false; + } +} + +static bool is_pseudo_new_file_record_pr3488( + const int level, + const uint64_t number, + const uint64_t file_size, + InternalKey& smallest, + InternalKey& largest, + const bool has_min_log_number_to_keep_) { + + if (level == 0 && number == 0 && file_size == 0 && + has_min_log_number_to_keep_) { + InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue); + return (*smallest.rep() == *dummy_key.rep() && + *largest.rep() == *dummy_key.rep()); + } else { + return false; + } +} + +const char* VersionEdit::DecodeNewFile4From(Slice* input) { + const char* msg = nullptr; + int level = 0; + FileMetaData f; + uint64_t number = 0; + uint32_t path_id = 0; + uint64_t file_size = 0; + SequenceNumber smallest_seqno = 0; + SequenceNumber largest_seqno = kMaxSequenceNumber; + if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && + GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && + GetInternalKey(input, &f.largest) && + GetVarint64(input, &smallest_seqno) && + GetVarint64(input, &largest_seqno)) { + // See comments in VersionEdit::EncodeTo() for format of customized fields + while (true) { + uint32_t custom_tag = 0; + Slice field; + if (!GetVarint32(input, &custom_tag)) { + return "new-file4 custom field"; + } + if (custom_tag == kTerminate) { + break; + } + if (!GetLengthPrefixedSlice(input, &field)) { + return "new-file4 custom field length prefixed slice error"; + } + switch (custom_tag) { + case kPathId: + if (field.size() != 1) { + return "path_id field wrong size"; 
+        }
+        path_id = field[0];
+        if (path_id > 3) {
+          return "path_id wrong value";
+        }
+        break;
+      case kOldestAncesterTime:
+        if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+          return "invalid oldest ancester time";
+        }
+        break;
+      case kFileCreationTime:
+        if (!GetVarint64(&field, &f.file_creation_time)) {
+          return "invalid file creation time";
+        }
+        break;
+      case kFileChecksum:
+        f.file_checksum = field.ToString();
+        break;
+      case kFileChecksumFuncName:
+        f.file_checksum_func_name = field.ToString();
+        break;
+      case kNeedCompaction:
+        if (field.size() != 1) {
+          return "need_compaction field wrong size";
+        }
+        f.marked_for_compaction = (field[0] == 1);
+        break;
+      case kMinLogNumberToKeepHack:
+        // This is a hack to encode kMinLogNumberToKeep in a
+        // forward-compatible fashion.
+        if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+          return "deleted log number malformed";
+        }
+        has_min_log_number_to_keep_ = true;
+        break;
+      case kOldestBlobFileNumber:
+        if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+          return "invalid oldest blob file number";
+        }
+        break;
+      case kTemperature:
+        if (field.size() != 1) {
+          return "temperature field wrong size";
+        } else {
+          Temperature casted_field = static_cast<Temperature>(field[0]);
+          if (casted_field <= Temperature::kCold) {
+            f.temperature = casted_field;
+          }
+        }
+        break;
+      case kUniqueId:
+        if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
+          f.unique_id = kNullUniqueId64x2;
+          return "invalid unique id";
+        }
+        break;
+      default:
+        if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+          // Should not proceed if it cannot understand it
+          return "new-file4 custom field not supported";
+        }
+        break;
+    }
+  }
+  } else {
+    return "new-file4 entry";
+  }
+  if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+                                       f.smallest, f.largest,
+                                       has_min_log_number_to_keep_)) {
+    // Since this has nothing to do with NewFile, return immediately.
+    return nullptr;
+  }
+  f.fd =
+      FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+  new_files_.push_back(std::make_pair(level, f));
+  return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+#ifndef NDEBUG
+  bool ignore_ignorable_tags = false;
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
+                           &ignore_ignorable_tags);
+#endif
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag = 0;
+
+  // Temporary storage for parsing
+  int level = 0;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+#ifndef NDEBUG
+    if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
+      tag = kTagSafeIgnoreMask;
+    }
+#endif
+    switch (tag) {
+      case kDbId:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          db_id_ = str.ToString();
+          has_db_id_ = true;
+        } else {
+          msg = "db id";
+        }
+        break;
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kMaxColumnFamily:
+        if (GetVarint32(&input, &max_column_family_)) {
+          has_max_column_family_ = true;
+        } else {
+          msg = "max column family";
+        }
+        break;
+
+      case kMinLogNumberToKeep:
+        if (GetVarint64(&input, &min_log_number_to_keep_)) {
+          has_min_log_number_to_keep_ = true;
+        } else {
+          msg = "min log number to keep";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactCursor:
+        if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+          // Here we re-use the output format of compact pointer in LevelDB
+          // to persist compact_cursors_
+          compact_cursors_.push_back(std::make_pair(level, key));
+        } else {
+          if (!msg) {
+            msg = "compaction cursor";
+          }
+        }
+        break;
+
+      case kDeletedFile: {
+        uint64_t number = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+      }
+
+      case kNewFile: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          f.fd = FileDescriptor(number, 0, file_size);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+      }
+      case kNewFile2: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        SequenceNumber smallest_seqno = 0;
+        SequenceNumber largest_seqno = kMaxSequenceNumber;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file2 entry"; + } + } + break; + } + + case kNewFile3: { + uint64_t number = 0; + uint32_t path_id = 0; + uint64_t file_size = 0; + SequenceNumber smallest_seqno = 0; + SequenceNumber largest_seqno = kMaxSequenceNumber; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, + largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file3 entry"; + } + } + break; + } + + case kNewFile4: { + msg = DecodeNewFile4From(&input); + break; + } + + case kBlobFileAddition: + case kBlobFileAddition_DEPRECATED: { + BlobFileAddition blob_file_addition; + const Status s = blob_file_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFile(std::move(blob_file_addition)); + break; + } + + case kBlobFileGarbage: + case kBlobFileGarbage_DEPRECATED: { + BlobFileGarbage blob_file_garbage; + const Status s = blob_file_garbage.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFileGarbage(std::move(blob_file_garbage)); + break; + } + + case kWalAddition: { + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalDeletion: { + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kColumnFamily: + if (!GetVarint32(&input, &column_family_)) { + if (!msg) { + msg = "set column family id"; + } + } + break; + + case kColumnFamilyAdd: + if (GetLengthPrefixedSlice(&input, &str)) { + is_column_family_add_ = true; + column_family_name_ = str.ToString(); + } else { + if (!msg) { + msg = "column family add"; + } + } + break; + + case kColumnFamilyDrop: + is_column_family_drop_ = true; + break; + + case kInAtomicGroup: + is_in_atomic_group_ = true; + if (!GetVarint32(&input, &remaining_entries_)) { + if (!msg) { + msg = "remaining entries"; + } + } + break; + + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + + default: + if (tag & kTagSafeIgnoreMask) { + // Tag from future which can be safely ignored. + // The next field must be the length of the entry. 
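// [Editorial note -- not part of the upstream patch] This is the
// forward-compatibility path of the tag format. Tags with the
// kTagSafeIgnoreMask bit set are laid out as
//
//   [tag : varint32][field length : varint32][payload : <length> bytes]
//
// so a reader that does not recognize the tag can still skip exactly the
// right number of bytes (the remove_prefix() below), while unmasked unknown
// tags fail decoding with "unknown tag".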
+        uint32_t field_len;
+        if (!GetVarint32(&input, &field_len) ||
+            static_cast<size_t>(field_len) > input.size()) {
+          if (!msg) {
+            msg = "safely ignorable tag length error";
+          }
+        } else {
+          input.remove_prefix(static_cast<size_t>(field_len));
+        }
+      } else {
+        msg = "unknown tag";
+      }
+      break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
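The `default` case above is what keeps the tag space forward compatible: any tag with the `kTagSafeIgnoreMask` bit set is immediately followed by a varint32 payload length, so an old reader can skip data it does not understand. A minimal self-contained sketch of that skipping scheme (illustrative only; `ReadVarint32` here is a hand-rolled stand-in for RocksDB's `GetVarint32`):

#include <cstdint>
#include <string_view>

constexpr uint32_t kSafeIgnoreMask = 1u << 13;  // mirrors kTagSafeIgnoreMask

// LEB128-style varint32 decoder, matching the MANIFEST's varint encoding.
bool ReadVarint32(std::string_view* in, uint32_t* v) {
  *v = 0;
  for (int shift = 0; shift <= 28; shift += 7) {
    if (in->empty()) return false;
    const uint8_t byte = static_cast<uint8_t>(in->front());
    in->remove_prefix(1);
    *v |= static_cast<uint32_t>(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) return true;
  }
  return false;
}

// Returns true if an unknown tag's payload could be skipped safely.
bool SkipUnknownTag(uint32_t tag, std::string_view* in) {
  if (!(tag & kSafeIgnoreMask)) return false;  // not ignorable: corruption
  uint32_t len = 0;
  if (!ReadVarint32(in, &len) || len > in->size()) return false;
  in->remove_prefix(len);  // skip the payload without interpreting it
  return true;
}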
"); + r.append(f.largest.DebugString(hex_key)); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, f.oldest_blob_file_number); + } + r.append(" oldest_ancester_time:"); + AppendNumberTo(&r, f.oldest_ancester_time); + r.append(" file_creation_time:"); + AppendNumberTo(&r, f.file_creation_time); + r.append(" file_checksum:"); + r.append(Slice(f.file_checksum).ToString(true)); + r.append(" file_checksum_func_name: "); + r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format whenthe feature becomes + // permanent + r.append(std::to_string(static_cast(f.temperature))); + } + if (f.unique_id != kNullUniqueId64x2) { + r.append(" unique_id(internal): "); + UniqueId64x2 id = f.unique_id; + r.append(InternalUniqueIdToHumanString(&id)); + r.append(" public_unique_id: "); + InternalUniqueIdToExternal(&id); + r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id))); + } + } + + for (const auto& blob_file_addition : blob_file_additions_) { + r.append("\n BlobFileAddition: "); + r.append(blob_file_addition.DebugString()); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + r.append("\n BlobFileGarbage: "); + r.append(blob_file_garbage.DebugString()); + } + + for (const auto& wal_addition : wal_additions_) { + r.append("\n WalAddition: "); + r.append(wal_addition.DebugString()); + } + + if (!wal_deletion_.IsEmpty()) { + r.append("\n WalDeletion: "); + r.append(wal_deletion_.DebugString()); + } + + r.append("\n ColumnFamily: "); + AppendNumberTo(&r, column_family_); + if (is_column_family_add_) { + r.append("\n ColumnFamilyAdd: "); + r.append(column_family_name_); + } + if (is_column_family_drop_) { + r.append("\n ColumnFamilyDrop"); + } + if (is_in_atomic_group_) { + r.append("\n AtomicGroup: "); + AppendNumberTo(&r, remaining_entries_); + r.append(" entries remains"); + } + if (HasFullHistoryTsLow()) { + r.append("\n FullHistoryTsLow: "); + r.append(Slice(full_history_ts_low_).ToString(hex_key)); + } + r.append("\n}\n"); + return r; +} + +std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { + JSONWriter jw; + jw << "EditNumber" << edit_num; + + if (has_db_id_) { + jw << "DB ID" << db_id_; + } + if (has_comparator_) { + jw << "Comparator" << comparator_; + } + if (has_log_number_) { + jw << "LogNumber" << log_number_; + } + if (has_prev_log_number_) { + jw << "PrevLogNumber" << prev_log_number_; + } + if (has_next_file_number_) { + jw << "NextFileNumber" << next_file_number_; + } + if (has_max_column_family_) { + jw << "MaxColumnFamily" << max_column_family_; + } + if (has_min_log_number_to_keep_) { + jw << "MinLogNumberToKeep" << min_log_number_to_keep_; + } + if (has_last_sequence_) { + jw << "LastSeq" << last_sequence_; + } + + if (!deleted_files_.empty()) { + jw << "DeletedFiles"; + jw.StartArray(); + + for (const auto& deleted_file : deleted_files_) { + jw.StartArrayedObject(); + jw << "Level" << deleted_file.first; + jw << "FileNumber" << deleted_file.second; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!new_files_.empty()) { + jw << "AddedFiles"; + jw.StartArray(); + + for (size_t i = 0; i < new_files_.size(); i++) { + jw.StartArrayedObject(); + jw << "Level" << new_files_[i].first; + const FileMetaData& f = new_files_[i].second; + jw << "FileNumber" << f.fd.GetNumber(); + jw << "FileSize" << f.fd.GetFileSize(); + jw << "SmallestIKey" << f.smallest.DebugString(hex_key); + jw << 
"LargestIKey" << f.largest.DebugString(hex_key); + jw << "OldestAncesterTime" << f.oldest_ancester_time; + jw << "FileCreationTime" << f.file_creation_time; + jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); + jw << "FileChecksumFuncName" << f.file_checksum_func_name; + if (f.temperature != Temperature::kUnknown) { + jw << "temperature" << std::to_string(static_cast(f.temperature)); + } + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + jw << "OldestBlobFile" << f.oldest_blob_file_number; + } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format whenthe feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_additions_.empty()) { + jw << "BlobFileAdditions"; + + jw.StartArray(); + + for (const auto& blob_file_addition : blob_file_additions_) { + jw.StartArrayedObject(); + jw << blob_file_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_garbages_.empty()) { + jw << "BlobFileGarbages"; + + jw.StartArray(); + + for (const auto& blob_file_garbage : blob_file_garbages_) { + jw.StartArrayedObject(); + jw << blob_file_garbage; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!wal_additions_.empty()) { + jw << "WalAdditions"; + + jw.StartArray(); + + for (const auto& wal_addition : wal_additions_) { + jw.StartArrayedObject(); + jw << wal_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!wal_deletion_.IsEmpty()) { + jw << "WalDeletion"; + jw.StartObject(); + jw << wal_deletion_; + jw.EndObject(); + } + + jw << "ColumnFamily" << column_family_; + + if (is_column_family_add_) { + jw << "ColumnFamilyAdd" << column_family_name_; + } + if (is_column_family_drop_) { + jw << "ColumnFamilyDrop" << column_family_name_; + } + if (is_in_atomic_group_) { + jw << "AtomicGroup" << remaining_entries_; + } + + if (HasFullHistoryTsLow()) { + jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); + } + + jw.EndObject(); + + return jw.Get(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h new file mode 100644 index 000000000..c9800a3c0 --- /dev/null +++ b/src/rocksdb/db/version_edit.h @@ -0,0 +1,669 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_garbage.h" +#include "db/dbformat.h" +#include "db/wal_edit.h" +#include "memory/arena.h" +#include "port/malloc.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/cache.h" +#include "table/table_reader.h" +#include "table/unique_id_impl.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. The number should be forward compatible so +// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' +// between Tag and kTagSafeIgnoreMask field. 
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..c9800a3c0
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,669 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_garbage.h"
+#include "db/dbformat.h"
+#include "db/wal_edit.h"
+#include "memory/arena.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The number should be forward compatible so
+// users can down-grade RocksDB safely. A future Tag is ignored by doing '&'
+// between Tag and kTagSafeIgnoreMask field.
+enum Tag : uint32_t {
+  kComparator = 1,
+  kLogNumber = 2,
+  kNextFileNumber = 3,
+  kLastSequence = 4,
+  kCompactCursor = 5,
+  kDeletedFile = 6,
+  kNewFile = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber = 9,
+  kMinLogNumberToKeep = 10,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2 = 100,
+  kNewFile3 = 102,
+  kNewFile4 = 103,      // 4th (the latest) format version of adding files
+  kColumnFamily = 200,  // specify column family for version edit
+  kColumnFamilyAdd = 201,
+  kColumnFamilyDrop = 202,
+  kMaxColumnFamily = 203,
+
+  kInAtomicGroup = 300,
+
+  kBlobFileAddition = 400,
+  kBlobFileGarbage,
+
+  // Mask for an unidentified tag from the future which can be safely ignored.
+  kTagSafeIgnoreMask = 1 << 13,
+
+  // Forward compatible (aka ignorable) records
+  kDbId,
+  kBlobFileAddition_DEPRECATED,
+  kBlobFileGarbage_DEPRECATED,
+  kWalAddition,
+  kWalDeletion,
+  kFullHistoryTsLow,
+  kWalAddition2,
+  kWalDeletion2,
+};
+
+enum NewFileCustomTag : uint32_t {
+  kTerminate = 1,  // The end of customized fields
+  kNeedCompaction = 2,
+  // Since Manifest is not entirely forward-compatible, we currently encode
+  // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
+  // when manifest becomes forward-compatible.
+  kMinLogNumberToKeepHack = 3,
+  kOldestBlobFileNumber = 4,
+  kOldestAncesterTime = 5,
+  kFileCreationTime = 6,
+  kFileChecksum = 7,
+  kFileChecksumFuncName = 8,
+  kTemperature = 9,
+  kMinTimestamp = 10,
+  kMaxTimestamp = 11,
+  kUniqueId = 12,
+
+  // If this bit for the custom tag is set, opening DB should fail if
+  // we don't know this field.
+  kCustomTagNonSafeIgnoreMask = 1 << 6,
+
+  // Forward incompatible (aka unignorable) fields
+  kPathId,
+};
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
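kFileNumberMask reserves the low 62 bits of the packed value for the file number, leaving the top two bits for the path id; the FileDescriptor accessors below rely on exactly this split. A sketch of the packing arithmetic (upstream implements PackFileNumberAndPathId() in version_edit.cc; only the math is shown here):

#include <cassert>
#include <cstdint>

constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;  // kFileNumberMask (62 bits)

uint64_t Pack(uint64_t number, uint64_t path_id) {
  assert(number <= kMask);
  return number | (path_id * (kMask + 1));  // path_id lands in bits 62..63
}

uint64_t NumberOf(uint64_t packed) { return packed & kMask; }
uint32_t PathIdOf(uint64_t packed) {
  return static_cast<uint32_t>(packed / (kMask + 1));
}

static_assert(((0x123 | (2 * (kMask + 1))) >> 62) == 2, "path id in top bits");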
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contain a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copy of the structure is used when the
+// file is not in any live version any more.
+struct FileDescriptor {
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+  uint64_t packed_number_and_path_id;
+  uint64_t file_size;             // File size in bytes
+  SequenceNumber smallest_seqno;  // The smallest seqno in this file
+  SequenceNumber largest_seqno;   // The largest seqno in this file
+
+  FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+      : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+                 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+      : table_reader(nullptr),
+        packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+        file_size(_file_size),
+        smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno) {}
+
+  FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+  FileDescriptor& operator=(const FileDescriptor& fd) {
+    table_reader = fd.table_reader;
+    packed_number_and_path_id = fd.packed_number_and_path_id;
+    file_size = fd.file_size;
+    smallest_seqno = fd.smallest_seqno;
+    largest_seqno = fd.largest_seqno;
+    return *this;
+  }
+
+  uint64_t GetNumber() const {
+    return packed_number_and_path_id & kFileNumberMask;
+  }
+  uint32_t GetPathId() const {
+    return static_cast<uint32_t>(packed_number_and_path_id /
+                                 (kFileNumberMask + 1));
+  }
+  uint64_t GetFileSize() const { return file_size; }
+};
+
+struct FileSampledStats {
+  FileSampledStats() : num_reads_sampled(0) {}
+  FileSampledStats(const FileSampledStats& other) { *this = other; }
+  FileSampledStats& operator=(const FileSampledStats& other) {
+    num_reads_sampled = other.num_reads_sampled.load();
+    return *this;
+  }
+
+  // number of user reads to this file.
+  mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+  FileDescriptor fd;
+  InternalKey smallest;  // Smallest internal key served by table
+  InternalKey largest;   // Largest internal key served by table
+
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle = nullptr;
+
+  FileSampledStats stats;
+
+  // Stats for compensating deletion entries during compaction
+
+  // File size compensated by deletion entry.
+  // This is updated in Version::UpdateAccumulatedStats() first time when the
+  // file is created or loaded. After it is updated (!= 0), it is immutable.
+  uint64_t compensated_file_size = 0;
+  // These values can mutate, but they can only be read or written from
+  // single-threaded LogAndApply thread
+  uint64_t num_entries = 0;     // the number of entries.
+  uint64_t num_deletions = 0;   // the number of deletion entries.
+  uint64_t raw_key_size = 0;    // total uncompressed key size.
+  uint64_t raw_value_size = 0;  // total uncompressed value size.
+
+  int refs = 0;  // Reference count
+
+  bool being_compacted = false;       // Is this file undergoing compaction?
+  bool init_stats_from_file = false;  // true if the data-entry stats of this
+                                      // file have been initialized from file.
+
+  bool marked_for_compaction = false;  // True if client asked us nicely to
+                                       // compact this file.
+  Temperature temperature = Temperature::kUnknown;
+
+  // Used only in BlobDB. The file number of the oldest blob file this SST file
+  // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output from other SST files, which could
+  // in turn be outputs of compacting older SST files. We track the memtable
+  // flush timestamp for the oldest SST file that eventually contributes data
+  // to this file. 0 means the information is not available.
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+  // Unix time when the SST file is created.
+  uint64_t file_creation_time = kUnknownFileCreationTime;
+
+  // File checksum
+  std::string file_checksum = kUnknownFileChecksum;
+
+  // File checksum function name
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+  // SST unique id
+  UniqueId64x2 unique_id{};
+
+  FileMetaData() = default;
+
+  FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+               const InternalKey& smallest_key, const InternalKey& largest_key,
+               const SequenceNumber& smallest_seq,
+               const SequenceNumber& largest_seq, bool marked_for_compact,
+               Temperature _temperature, uint64_t oldest_blob_file,
+               uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+               const std::string& _file_checksum,
+               const std::string& _file_checksum_func_name,
+               UniqueId64x2 _unique_id)
+      : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+        smallest(smallest_key),
+        largest(largest_key),
+        marked_for_compaction(marked_for_compact),
+        temperature(_temperature),
+        oldest_blob_file_number(oldest_blob_file),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        file_checksum(_file_checksum),
+        file_checksum_func_name(_file_checksum_func_name),
+        unique_id(std::move(_unique_id)) {
+    TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+  }
+
+  // REQUIRED: Keys must be given to the function in sorted order (it expects
+  // the last key to be the largest).
+  Status UpdateBoundaries(const Slice& key, const Slice& value,
+                          SequenceNumber seqno, ValueType value_type);
+
+  // Unlike UpdateBoundaries, ranges do not need to be presented in any
+  // particular order.
+  void UpdateBoundariesForRange(const InternalKey& start,
+                                const InternalKey& end, SequenceNumber seqno,
+                                const InternalKeyComparator& icmp) {
+    if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+      smallest = start;
+    }
+    if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+      largest = end;
+    }
+    fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+    fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+  }
+
+  // Try to get oldest ancester time from the class itself or table properties
+  // if table reader is already pinned.
+  // 0 means the information is not available.
+  uint64_t TryGetOldestAncesterTime() {
+    if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+      return oldest_ancester_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->creation_time;
+    }
+    return kUnknownOldestAncesterTime;
+  }
+
+  uint64_t TryGetFileCreationTime() {
+    if (file_creation_time != kUnknownFileCreationTime) {
+      return file_creation_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->file_creation_time;
+    }
+    return kUnknownFileCreationTime;
+  }
+
+  // WARNING: manual update to this function is needed
+  //          whenever a new string property is added to FileMetaData
+  //          to reduce approximation error.
+  //
+  // TODO: eliminate the need of manually updating this function
+  //       for new string properties
+  size_t ApproximateMemoryUsage() const {
+    size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<FileMetaData*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    usage += smallest.size() + largest.size() + file_checksum.size() +
+             file_checksum_func_name.size();
+    return usage;
+  }
+};
+
+// A compressed copy of file meta data that just contains minimum data needed
+// to serve read operations, while still keeping the pointer to full metadata
+// of the file in case it is needed.
+struct FdWithKeyRange {
+  FileDescriptor fd;
+  FileMetaData* file_metadata;  // Point to all metadata
+  Slice smallest_key;           // slice that contains the smallest key
+  Slice largest_key;            // slice that contains the largest key
+
+  FdWithKeyRange()
+      : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
+
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+                 FileMetaData* _file_metadata)
+      : fd(_fd),
+        file_metadata(_file_metadata),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level
+// Actual data is guaranteed to be stored closely
+struct LevelFilesBrief {
+  size_t num_files;
+  FdWithKeyRange* files;
+  LevelFilesBrief() {
+    num_files = 0;
+    files = nullptr;
+  }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+  void Clear();
+
+  void SetDBId(const std::string& db_id) {
+    has_db_id_ = true;
+    db_id_ = db_id;
+  }
+  bool HasDbId() const { return has_db_id_; }
+  const std::string& GetDbId() const { return db_id_; }
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  bool HasComparatorName() const { return has_comparator_; }
+  const std::string& GetComparatorName() const { return comparator_; }
+
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  bool HasLogNumber() const { return has_log_number_; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  bool HasPrevLogNumber() const { return has_prev_log_number_; }
+  uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  bool HasNextFile() const { return has_next_file_number_; }
+  uint64_t GetNextFile() const { return next_file_number_; }
+
+  void SetMaxColumnFamily(uint32_t max_column_family) {
+    has_max_column_family_ = true;
+    max_column_family_ = max_column_family;
+  }
+  bool HasMaxColumnFamily() const { return has_max_column_family_; }
+  uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+  void SetMinLogNumberToKeep(uint64_t num) {
+    has_min_log_number_to_keep_ = true;
+    min_log_number_to_keep_ = num;
+  }
+  bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+  uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  bool HasLastSequence() const { return has_last_sequence_; }
+  SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+  // Delete the specified table file from the specified level.
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.emplace(level, file);
+  }
+
+  // Retrieve the table files deleted as well as their associated levels.
+  using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+  const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+  // Add the specified table file at the specified level.
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+  // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+  void AddFile(int level, uint64_t file, uint32_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
+               const InternalKey& largest, const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno, bool marked_for_compaction,
+               Temperature temperature, uint64_t oldest_blob_file_number,
+               uint64_t oldest_ancester_time, uint64_t file_creation_time,
+               const std::string& file_checksum,
+               const std::string& file_checksum_func_name,
+               const UniqueId64x2& unique_id) {
+    assert(smallest_seqno <= largest_seqno);
+    new_files_.emplace_back(
+        level,
+        FileMetaData(file, file_path_id, file_size, smallest, largest,
+                     smallest_seqno, largest_seqno, marked_for_compaction,
+                     temperature, oldest_blob_file_number, oldest_ancester_time,
+                     file_creation_time, file_checksum, file_checksum_func_name,
+                     unique_id));
+    if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
+      SetLastSequence(largest_seqno);
+    }
+  }
+
+  void AddFile(int level, const FileMetaData& f) {
+    assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+    new_files_.emplace_back(level, f);
+    if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
+      SetLastSequence(f.fd.largest_seqno);
+    }
+  }
+
+  // Retrieve the table files added as well as their associated levels.
+  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+  const NewFiles& GetNewFiles() const { return new_files_; }
+
+  // Retrieve all the compact cursors
+  using CompactCursors = std::vector<std::pair<int, InternalKey>>;
+  const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
+  void AddCompactCursor(int level, const InternalKey& cursor) {
+    compact_cursors_.push_back(std::make_pair(level, cursor));
+  }
+  void SetCompactCursors(
+      const std::vector<InternalKey>& compact_cursors_by_level) {
+    compact_cursors_.clear();
+    compact_cursors_.reserve(compact_cursors_by_level.size());
+    for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
+      if (compact_cursors_by_level[i].Valid()) {
+        compact_cursors_.push_back(
+            std::make_pair(i, compact_cursors_by_level[i]));
+      }
+    }
+  }
+
+  // Add a new blob file.
+  void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
+                   uint64_t total_blob_bytes, std::string checksum_method,
+                   std::string checksum_value) {
+    blob_file_additions_.emplace_back(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value));
+  }
+
+  void AddBlobFile(BlobFileAddition blob_file_addition) {
+    blob_file_additions_.emplace_back(std::move(blob_file_addition));
+  }
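As a usage sketch, a compaction that replaces two L0 files with one L1 file could be recorded through the setters above (every value is hypothetical, and `smallest_key`/`largest_key` are assumed to be InternalKeys built elsewhere):

VersionEdit edit;
edit.SetColumnFamily(0);
edit.DeleteFile(/*level=*/0, /*file=*/11);
edit.DeleteFile(/*level=*/0, /*file=*/12);
edit.AddFile(/*level=*/1, /*file=*/13, /*file_path_id=*/0,
             /*file_size=*/4096, smallest_key, largest_key,
             /*smallest_seqno=*/100, /*largest_seqno=*/200,
             /*marked_for_compaction=*/false, Temperature::kUnknown,
             kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
             kUnknownFileCreationTime, kUnknownFileChecksum,
             kUnknownFileChecksumFuncName, UniqueId64x2{});
// AddFile() also bumps the edit's last sequence to 200 as a side effect.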
+
+  // Retrieve all the blob files added.
+  using BlobFileAdditions = std::vector<BlobFileAddition>;
+  const BlobFileAdditions& GetBlobFileAdditions() const {
+    return blob_file_additions_;
+  }
+
+  void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
+    assert(blob_file_additions_.empty());
+    blob_file_additions_ = std::move(blob_file_additions);
+  }
+
+  // Add garbage for an existing blob file.
+  void AddBlobFileGarbage(uint64_t blob_file_number,
+                          uint64_t garbage_blob_count,
+                          uint64_t garbage_blob_bytes) {
+    blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
+                                     garbage_blob_bytes);
+  }
+
+  void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
+    blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
+  }
+
+  // Retrieve all the blob file garbage added.
+  using BlobFileGarbages = std::vector<BlobFileGarbage>;
+  const BlobFileGarbages& GetBlobFileGarbages() const {
+    return blob_file_garbages_;
+  }
+
+  void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
+    assert(blob_file_garbages_.empty());
+    blob_file_garbages_ = std::move(blob_file_garbages);
+  }
+
+  // Add a WAL (either just created or closed).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
+    assert(NumEntries() == wal_additions_.size());
+    wal_additions_.emplace_back(number, std::move(metadata));
+  }
+
+  // Retrieve all the added WALs.
+  const WalAdditions& GetWalAdditions() const { return wal_additions_; }
+
+  bool IsWalAddition() const { return !wal_additions_.empty(); }
+
+  // Delete a WAL (either directly deleted or archived).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void DeleteWalsBefore(WalNumber number) {
+    assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
+    wal_deletion_ = WalDeletion(number);
+  }
+
+  const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
+
+  bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
+
+  bool IsWalManipulation() const {
+    size_t entries = NumEntries();
+    return (entries > 0) && ((entries == wal_additions_.size()) ||
+                             (entries == !wal_deletion_.IsEmpty()));
+  }
+
+  // Number of edits
+  size_t NumEntries() const {
+    return new_files_.size() + deleted_files_.size() +
+           blob_file_additions_.size() + blob_file_garbages_.size() +
+           wal_additions_.size() + !wal_deletion_.IsEmpty();
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+  uint32_t GetColumnFamily() const { return column_family_; }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  bool IsColumnFamilyManipulation() const {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  bool IsColumnFamilyAdd() const { return is_column_family_add_; }
+
+  bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
+
+  void MarkAtomicGroup(uint32_t remaining_entries) {
+    is_in_atomic_group_ = true;
+    remaining_entries_ = remaining_entries;
+  }
+  bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+  uint32_t GetRemainingEntries() const { return
remaining_entries_; } + + bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } + const std::string& GetFullHistoryTsLow() const { + assert(HasFullHistoryTsLow()); + return full_history_ts_low_; + } + void SetFullHistoryTsLow(std::string full_history_ts_low) { + assert(!full_history_ts_low.empty()); + full_history_ts_low_ = std::move(full_history_ts_low); + } + + // return true on success. + bool EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString(bool hex_key = false) const; + std::string DebugJSON(int edit_num, bool hex_key = false) const; + + private: + friend class ReactiveVersionSet; + friend class VersionEditHandlerBase; + friend class ListColumnFamiliesHandler; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; + friend class VersionSet; + friend class Version; + friend class AtomicGroupReadBuffer; + + bool GetLevel(Slice* input, int* level, const char** msg); + + const char* DecodeNewFile4From(Slice* input); + + int max_level_ = 0; + std::string db_id_; + std::string comparator_; + uint64_t log_number_ = 0; + uint64_t prev_log_number_ = 0; + uint64_t next_file_number_ = 0; + uint32_t max_column_family_ = 0; + // The most recent WAL log number that is deleted + uint64_t min_log_number_to_keep_ = 0; + SequenceNumber last_sequence_ = 0; + bool has_db_id_ = false; + bool has_comparator_ = false; + bool has_log_number_ = false; + bool has_prev_log_number_ = false; + bool has_next_file_number_ = false; + bool has_max_column_family_ = false; + bool has_min_log_number_to_keep_ = false; + bool has_last_sequence_ = false; + + // Compaction cursors for round-robin compaction policy + CompactCursors compact_cursors_; + + DeletedFiles deleted_files_; + NewFiles new_files_; + + BlobFileAdditions blob_file_additions_; + BlobFileGarbages blob_file_garbages_; + + WalAdditions wal_additions_; + WalDeletion wal_deletion_; + + // Each version edit record should have column_family_ set + // If it's not set, it is default (0) + uint32_t column_family_ = 0; + // a version edit can be either column_family add or + // column_family drop. If it's column family add, + // it also includes column family name. + bool is_column_family_drop_ = false; + bool is_column_family_add_ = false; + std::string column_family_name_; + + bool is_in_atomic_group_ = false; + uint32_t remaining_entries_ = 0; + + std::string full_history_ts_low_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit_handler.cc b/src/rocksdb/db/version_edit_handler.cc new file mode 100644 index 000000000..145e78789 --- /dev/null +++ b/src/rocksdb/db/version_edit_handler.cc @@ -0,0 +1,1002 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
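The handlers defined in this file consume MANIFEST records through VersionEditHandlerBase::Iterate() below. Reduced to its skeleton (atomic-group buffering, read limits, and error decoration elided), the recovery loop is roughly:

Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch)) {  // one record per VersionEdit
  VersionEdit edit;
  Status s = edit.DecodeFrom(record);
  if (!s.ok()) break;
  ColumnFamilyData* cfd = nullptr;
  s = ApplyVersionEdit(edit, &cfd);  // dispatch on the kind of edit
  if (!s.ok()) break;
}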
+
+#include "db/version_edit_handler.h"
+
+#include <cinttypes>
+#include <sstream>
+
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_source.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void VersionEditHandlerBase::Iterate(log::Reader& reader,
+                                     Status* log_read_status) {
+  Slice record;
+  std::string scratch;
+  assert(log_read_status);
+  assert(log_read_status->ok());
+
+  size_t recovered_edits = 0;
+  Status s = Initialize();
+  while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() &&
+         reader.ReadRecord(&record, &scratch) && log_read_status->ok()) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      break;
+    }
+
+    s = read_buffer_.AddEdit(&edit);
+    if (!s.ok()) {
+      break;
+    }
+    ColumnFamilyData* cfd = nullptr;
+    if (edit.is_in_atomic_group_) {
+      if (read_buffer_.IsFull()) {
+        for (auto& e : read_buffer_.replay_buffer()) {
+          s = ApplyVersionEdit(e, &cfd);
+          if (!s.ok()) {
+            break;
+          }
+          ++recovered_edits;
+        }
+        if (!s.ok()) {
+          break;
+        }
+        read_buffer_.Clear();
+      }
+    } else {
+      s = ApplyVersionEdit(edit, &cfd);
+      if (s.ok()) {
+        ++recovered_edits;
+      }
+    }
+  }
+  if (!log_read_status->ok()) {
+    s = *log_read_status;
+  }
+
+  CheckIterationResult(reader, &s);
+
+  if (!s.ok()) {
+    if (s.IsCorruption()) {
+      // when we find a Corruption error, something is
+      // wrong with the underlying file. In this case we
+      // want to report the filename, so in here we append
+      // the filename to the Corruption message
+      assert(reader.file());
+
+      // build a new error message
+      std::stringstream message;
+      // append previous dynamic state message
+      const char* state = s.getState();
+      if (state != nullptr) {
+        message << state;
+        message << ' ';
+      }
+      // append the filename to the corruption message
+      message << "in file " << reader.file()->file_name();
+      // overwrite the status with the extended status
+      s = Status(s.code(), s.subcode(), s.severity(), message.str());
+    }
+    status_ = s;
+  }
+  TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish",
+                           &recovered_edits);
+}
+
+Status ListColumnFamiliesHandler::ApplyVersionEdit(
+    VersionEdit& edit, ColumnFamilyData** /*unused*/) {
+  Status s;
+  if (edit.is_column_family_add_) {
+    if (column_family_names_.find(edit.column_family_) !=
+        column_family_names_.end()) {
+      s = Status::Corruption("Manifest adding the same column family twice");
+    } else {
+      column_family_names_.insert(
+          {edit.column_family_, edit.column_family_name_});
+    }
+  } else if (edit.is_column_family_drop_) {
+    if (column_family_names_.find(edit.column_family_) ==
+        column_family_names_.end()) {
+      s = Status::Corruption("Manifest - dropping non-existing column family");
+    } else {
+      column_family_names_.erase(edit.column_family_);
+    }
+  }
+  return s;
+}
+
+Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
+                                               ColumnFamilyData** /*unused*/) {
+  for (const auto& deleted_file : edit.GetDeletedFiles()) {
+    Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  for (const auto& new_file : edit.GetNewFiles()) {
+    Status s = file_checksum_list_.InsertOneFileChecksum(
+        new_file.second.fd.GetNumber(), new_file.second.file_checksum,
+        new_file.second.file_checksum_func_name);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
+    std::string checksum_value = new_blob_file.GetChecksumValue();
+    std::string checksum_method = new_blob_file.GetChecksumMethod();
+    assert(checksum_value.empty() == checksum_method.empty());
+    if (checksum_method.empty()) {
+      checksum_value = kUnknownFileChecksum;
+      checksum_method = kUnknownFileChecksumFuncName;
+    }
+    Status s = file_checksum_list_.InsertOneFileChecksum(
+        new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+VersionEditHandler::VersionEditHandler(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, bool track_missing_files,
+    bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
+    bool skip_load_table_files)
+    : VersionEditHandlerBase(),
+      read_only_(read_only),
+      column_families_(std::move(column_families)),
+      version_set_(version_set),
+      track_missing_files_(track_missing_files),
+      no_error_if_files_missing_(no_error_if_files_missing),
+      io_tracer_(io_tracer),
+      skip_load_table_files_(skip_load_table_files),
+      initialized_(false) {
+  assert(version_set_ != nullptr);
+}
+
+Status VersionEditHandler::Initialize() {
+  Status s;
+  if (!initialized_) {
+    for (const auto& cf_desc : column_families_) {
+      name_to_options_.emplace(cf_desc.name, cf_desc.options);
+    }
+    auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName);
+    if (default_cf_iter == name_to_options_.end()) {
+      s = Status::InvalidArgument("Default column family not specified");
+    }
+    if (s.ok()) {
+      VersionEdit default_cf_edit;
+      default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+      default_cf_edit.SetColumnFamily(0);
+      ColumnFamilyData* cfd =
+          CreateCfAndInit(default_cf_iter->second, default_cf_edit);
+      assert(cfd != nullptr);
+#ifdef NDEBUG
+      (void)cfd;
+#endif
+      initialized_ = true;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
+                                            ColumnFamilyData** cfd) {
+  Status s;
+  if (edit.is_column_family_add_) {
+    s = OnColumnFamilyAdd(edit, cfd);
+  } else if (edit.is_column_family_drop_) {
+    s = OnColumnFamilyDrop(edit, cfd);
+  } else if (edit.IsWalAddition()) {
+    s = OnWalAddition(edit);
+  } else if (edit.IsWalDeletion()) {
+    s = OnWalDeletion(edit);
+  } else {
+    s = OnNonCfOperation(edit, cfd);
+  }
+  if (s.ok()) {
+    assert(cfd != nullptr);
+    s = ExtractInfoFromVersionEdit(*cfd, edit);
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
+                                             ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  Status s;
+  if (cf_in_builders || cf_in_not_found) {
+    s = Status::Corruption("MANIFEST adding the same column family twice: " +
+                           edit.column_family_name_);
+  }
+  if (s.ok()) {
+    auto cf_options = name_to_options_.find(edit.column_family_name_);
+    // implicitly add persistent_stats column family without requiring user
+    // to specify
+    ColumnFamilyData* tmp_cfd = nullptr;
+    bool is_persistent_stats_column_family =
+        edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+    if (cf_options == name_to_options_.end() &&
+        !is_persistent_stats_column_family) {
+      column_families_not_found_.emplace(edit.column_family_,
+                                         edit.column_family_name_);
+    } else {
+      if (is_persistent_stats_column_family) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        tmp_cfd = CreateCfAndInit(cfo, edit);
+      } else {
+        tmp_cfd = CreateCfAndInit(cf_options->second, edit);
+      }
+      *cfd = tmp_cfd;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + ColumnFamilyData* tmp_cfd = nullptr; + Status s; + if (cf_in_builders) { + tmp_cfd = DestroyCfAndCleanup(edit); + } else if (cf_in_not_found) { + column_families_not_found_.erase(edit.column_family_); + } else { + s = Status::Corruption("MANIFEST - dropping non-existing column family"); + } + *cfd = tmp_cfd; + return s; +} + +Status VersionEditHandler::OnWalAddition(VersionEdit& edit) { + assert(edit.IsWalAddition()); + return version_set_->wals_.AddWals(edit.GetWalAdditions()); +} + +Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) { + assert(edit.IsWalDeletion()); + return version_set_->wals_.DeleteWalsBefore( + edit.GetWalDeletion().GetLogNumber()); +} + +Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + Status s; + if (!cf_in_not_found) { + if (!cf_in_builders) { + s = Status::Corruption( + "MANIFEST record referencing unknown column family"); + } + ColumnFamilyData* tmp_cfd = nullptr; + if (s.ok()) { + auto builder_iter = builders_.find(edit.column_family_); + assert(builder_iter != builders_.end()); + tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( + edit.column_family_); + assert(tmp_cfd != nullptr); + s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false); + if (s.ok()) { + s = builder_iter->second->version_builder()->Apply(&edit); + } + } + *cfd = tmp_cfd; + } + return s; +} + +// TODO maybe cache the computation result +bool VersionEditHandler::HasMissingFiles() const { + bool ret = false; + for (const auto& elem : cf_to_missing_files_) { + const auto& missing_files = elem.second; + if (!missing_files.empty()) { + ret = true; + break; + } + } + if (!ret) { + for (const auto& elem : cf_to_missing_blob_files_high_) { + if (elem.second != kInvalidBlobFileNumber) { + ret = true; + break; + } + } + } + return ret; +} + +void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, + bool* cf_in_not_found, + bool* cf_in_builders) const { + assert(cf_in_not_found != nullptr); + assert(cf_in_builders != nullptr); + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. + bool in_not_found = column_families_not_found_.find(edit.column_family_) != + column_families_not_found_.end(); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool in_builders = builders_.find(edit.column_family_) != builders_.end(); + // They cannot both be true + assert(!(in_not_found && in_builders)); + *cf_in_not_found = in_not_found; + *cf_in_builders = in_builders; +} + +void VersionEditHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + assert(s != nullptr); + if (!s->ok()) { + // Do nothing here. 
+ } else if (!version_edit_params_.has_log_number_ || + !version_edit_params_.has_next_file_number_ || + !version_edit_params_.has_last_sequence_) { + std::string msg("no "); + if (!version_edit_params_.has_log_number_) { + msg.append("log_file_number, "); + } + if (!version_edit_params_.has_next_file_number_) { + msg.append("next_file_number, "); + } + if (!version_edit_params_.has_last_sequence_) { + msg.append("last_sequence, "); + } + msg = msg.substr(0, msg.size() - 2); + msg.append(" entry in MANIFEST"); + *s = Status::Corruption(msg); + } + // There were some column families in the MANIFEST that weren't specified + // in the argument. This is OK in read_only mode + if (s->ok() && MustOpenAllColumnFamilies() && + !column_families_not_found_.empty()) { + std::string msg; + for (const auto& cf : column_families_not_found_) { + msg.append(", "); + msg.append(cf.second); + } + msg = msg.substr(2); + *s = Status::InvalidArgument("Column families not opened: " + msg); + } + if (s->ok()) { + version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily( + version_edit_params_.max_column_family_); + version_set_->MarkMinLogNumberToKeep( + version_edit_params_.min_log_number_to_keep_); + version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); + version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + *s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + if (read_only_) { + cfd->table_cache()->SetTablesAreImmortal(); + } + *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, + /*is_initial_load=*/true); + if (!s->ok()) { + // If s is IOError::PathNotFound, then we mark the db as corrupted. + if (s->IsPathNotFound()) { + *s = Status::Corruption("Corruption: " + s->ToString()); + } + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->column_family_set_)) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + VersionEdit edit; + *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true); + if (!s->ok()) { + break; + } + } + } + if (s->ok()) { + version_set_->manifest_file_size_ = reader.GetReadOffset(); + assert(version_set_->manifest_file_size_ > 0); + version_set_->next_file_number_.store( + version_edit_params_.next_file_number_ + 1); + SequenceNumber last_seq = version_edit_params_.last_sequence_; + assert(last_seq != kMaxSequenceNumber); + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_allocated_sequence_.load()) { + version_set_->last_allocated_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_published_sequence_.load()) { + version_set_->last_published_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_sequence_.load()) { + version_set_->last_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->descriptor_last_sequence_) { + // This is the maximum last sequence of all `VersionEdit`s iterated. 
It
+      // may be greater than the maximum `largest_seqno` of all files in case
+      // the newest data referred to by the MANIFEST has been dropped or had its
+      // sequence number zeroed through compaction.
+      version_set_->descriptor_last_sequence_ = last_seq;
+    }
+    version_set_->prev_log_number_ = version_edit_params_.prev_log_number_;
+  }
+}
+
+ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
+    const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
+  ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit);
+  assert(cfd != nullptr);
+  cfd->set_initialized();
+  assert(builders_.find(edit.column_family_) == builders_.end());
+  builders_.emplace(edit.column_family_,
+                    VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
+  if (track_missing_files_) {
+    cf_to_missing_files_.emplace(edit.column_family_,
+                                 std::unordered_set<uint64_t>());
+    cf_to_missing_blob_files_high_.emplace(edit.column_family_,
+                                           kInvalidBlobFileNumber);
+  }
+  return cfd;
+}
+
+ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  auto builder_iter = builders_.find(edit.column_family_);
+  assert(builder_iter != builders_.end());
+  builders_.erase(builder_iter);
+  if (track_missing_files_) {
+    auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
+    assert(missing_files_iter != cf_to_missing_files_.end());
+    cf_to_missing_files_.erase(missing_files_iter);
+
+    auto missing_blob_files_high_iter =
+        cf_to_missing_blob_files_high_.find(edit.column_family_);
+    assert(missing_blob_files_high_iter !=
+           cf_to_missing_blob_files_high_.end());
+    cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
+  }
+  ColumnFamilyData* ret =
+      version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
+  assert(ret != nullptr);
+  ret->SetDropped();
+  ret->UnrefAndTryDelete();
+  ret = nullptr;
+  return ret;
+}
+
+Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
+                                              ColumnFamilyData* cfd,
+                                              bool force_create_version) {
+  assert(cfd->initialized());
+  Status s;
+  if (force_create_version) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    auto* builder = builder_iter->second->version_builder();
+    auto* v = new Version(cfd, version_set_, version_set_->file_options_,
+                          *cfd->GetLatestMutableCFOptions(), io_tracer_,
+                          version_set_->current_version_number_++);
+    s = builder->SaveTo(v->storage_info());
+    if (s.ok()) {
+      // Install new version
+      v->PrepareAppend(
+          *cfd->GetLatestMutableCFOptions(),
+          !(version_set_->db_options_->skip_stats_update_on_db_open));
+      version_set_->AppendVersion(cfd, v);
+    } else {
+      delete v;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
+                                      bool prefetch_index_and_filter_in_cache,
+                                      bool is_initial_load) {
+  bool skip_load_table_files = skip_load_table_files_;
+  TEST_SYNC_POINT_CALLBACK(
+      "VersionEditHandler::LoadTables:skip_load_table_files",
+      &skip_load_table_files);
+  if (skip_load_table_files) {
+    return Status::OK();
+  }
+  assert(cfd != nullptr);
+  assert(!cfd->IsDropped());
+  auto builder_iter = builders_.find(cfd->GetID());
+  assert(builder_iter != builders_.end());
+  assert(builder_iter->second != nullptr);
+  VersionBuilder* builder = builder_iter->second->version_builder();
+  assert(builder);
+  Status s = builder->LoadTableHandlers(
+      cfd->internal_stats(),
+      version_set_->db_options_->max_file_opening_threads,
+      prefetch_index_and_filter_in_cache, is_initial_load,
cfd->GetLatestMutableCFOptions()->prefix_extractor, + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { + s = Status::OK(); + } + if (!s.ok() && !version_set_->db_options_->paranoid_checks) { + s = Status::OK(); + } + return s; +} + +Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit) { + Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( + version_set_->db_options()->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + version_edit_params_.SetLogNumber(edit.log_number_); + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); + } + } + + if (s.ok()) { + if (edit.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(edit.prev_log_number_); + } + if (edit.has_next_file_number_) { + version_edit_params_.SetNextFile(edit.next_file_number_); + } + if (edit.has_max_column_family_) { + version_edit_params_.SetMaxColumnFamily(edit.max_column_family_); + } + if (edit.has_min_log_number_to_keep_) { + version_edit_params_.min_log_number_to_keep_ = + std::max(version_edit_params_.min_log_number_to_keep_, + edit.min_log_number_to_keep_); + } + if (edit.has_last_sequence_) { + // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This + // is legacy behavior that cannot change without breaking downgrade + // compatibility. 
+      assert(!version_edit_params_.has_last_sequence_ ||
+             version_edit_params_.last_sequence_ <= edit.last_sequence_);
+      version_edit_params_.SetLastSequence(edit.last_sequence_);
+    }
+    if (!version_edit_params_.has_prev_log_number_) {
+      version_edit_params_.SetPrevLogNumber(0);
+    }
+  }
+  return s;
+}
+
+VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
+    : VersionEditHandler(read_only, column_families, version_set,
+                         /*track_missing_files=*/true,
+                         /*no_error_if_files_missing=*/true, io_tracer) {}
+
+VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
+  for (const auto& elem : versions_) {
+    delete elem.second;
+  }
+  versions_.clear();
+}
+
+void VersionEditHandlerPointInTime::CheckIterationResult(
+    const log::Reader& reader, Status* s) {
+  VersionEditHandler::CheckIterationResult(reader, s);
+  assert(s != nullptr);
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->column_family_set_)) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      assert(cfd->initialized());
+      auto v_iter = versions_.find(cfd->GetID());
+      if (v_iter != versions_.end()) {
+        assert(v_iter->second != nullptr);
+
+        version_set_->AppendVersion(cfd, v_iter->second);
+        versions_.erase(v_iter);
+      }
+    }
+  } else {
+    for (const auto& elem : versions_) {
+      delete elem.second;
+    }
+    versions_.clear();
+  }
+}
+
+ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit);
+  auto v_iter = versions_.find(edit.column_family_);
+  if (v_iter != versions_.end()) {
+    delete v_iter->second;
+    versions_.erase(v_iter);
+  }
+  return cfd;
+}
+
+Status VersionEditHandlerPointInTime::MaybeCreateVersion(
+    const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
+  assert(cfd != nullptr);
+  if (!force_create_version) {
+    assert(edit.column_family_ == cfd->GetID());
+  }
+  auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
+  assert(missing_files_iter != cf_to_missing_files_.end());
+  std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
+
+  auto missing_blob_files_high_iter =
+      cf_to_missing_blob_files_high_.find(cfd->GetID());
+  assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
+  const uint64_t prev_missing_blob_file_high =
+      missing_blob_files_high_iter->second;
+
+  VersionBuilder* builder = nullptr;
+
+  if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    builder = builder_iter->second->version_builder();
+    assert(builder != nullptr);
+  }
+
+  // At this point, we have not yet applied the new version edits read from the
+  // MANIFEST. We check whether we have any missing table and blob files.
+  const bool prev_has_missing_files =
+      !missing_files.empty() ||
+      (prev_missing_blob_file_high != kInvalidBlobFileNumber &&
+       prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
+
+  for (const auto& file : edit.GetDeletedFiles()) {
+    uint64_t file_num = file.second;
+    auto fiter = missing_files.find(file_num);
+    if (fiter != missing_files.end()) {
+      missing_files.erase(fiter);
+    }
+  }
+
+  assert(!cfd->ioptions()->cf_paths.empty());
+  Status s;
+  for (const auto& elem : edit.GetNewFiles()) {
+    int level = elem.first;
+    const FileMetaData& meta = elem.second;
+    const FileDescriptor& fd = meta.fd;
+    uint64_t file_num = fd.GetNumber();
+    const std::string fpath =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
+    s = VerifyFile(cfd, fpath, level, meta);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_files.insert(file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t missing_blob_file_num = prev_missing_blob_file_high;
+  for (const auto& elem : edit.GetBlobFileAdditions()) {
+    uint64_t file_num = elem.GetBlobFileNumber();
+    s = VerifyBlobFile(cfd, file_num, elem);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_blob_file_num = std::max(missing_blob_file_num, file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  bool has_missing_blob_files = false;
+  if (missing_blob_file_num != kInvalidBlobFileNumber &&
+      missing_blob_file_num >= prev_missing_blob_file_high) {
+    missing_blob_files_high_iter->second = missing_blob_file_num;
+    has_missing_blob_files = true;
+  } else if (missing_blob_file_num < prev_missing_blob_file_high) {
+    assert(false);
+  }
+
+  // We still have not applied the new version edit, but have tried to add new
+  // table and blob files after verifying their presence and consistency.
+  // Therefore, we know whether we will see new missing table and blob files
+  // later after actually applying the version edit. We perform the check here
+  // and record the result.
+  const bool has_missing_files =
+      !missing_files.empty() || has_missing_blob_files;
+
+  bool missing_info = !version_edit_params_.has_log_number_ ||
+                      !version_edit_params_.has_next_file_number_ ||
+                      !version_edit_params_.has_last_sequence_;
+
+  // Create the version before applying the edit. The version will represent
+  // the state before applying the version edit.
+  // A new version will be created if:
+  // 1) no error has occurred so far, and
+  // 2) log_number_, next_file_number_ and last_sequence_ are known, and
+  // 3) any of the following:
+  //   a) no missing file before, but will have missing file(s) after applying
+  //      this version edit.
+  //   b) no missing file after applying the version edit, and the caller
+  //      explicitly requests that a new version be created.
+  if (s.ok() && !missing_info &&
+      ((has_missing_files && !prev_has_missing_files) ||
+       (!has_missing_files && force_create_version))) {
+    if (!builder) {
+      auto builder_iter = builders_.find(cfd->GetID());
+      assert(builder_iter != builders_.end());
+      builder = builder_iter->second->version_builder();
+      assert(builder);
+    }
+
+    auto* version = new Version(cfd, version_set_, version_set_->file_options_,
+                                *cfd->GetLatestMutableCFOptions(), io_tracer_,
+                                version_set_->current_version_number_++);
+    s = builder->LoadTableHandlers(
+        cfd->internal_stats(),
+        version_set_->db_options_->max_file_opening_threads, false, true,
+        cfd->GetLatestMutableCFOptions()->prefix_extractor,
+        MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+    if (!s.ok()) {
+      delete version;
+      if (s.IsCorruption()) {
+        s = Status::OK();
+      }
+      return s;
+    }
+    s = builder->SaveTo(version->storage_info());
+    if (s.ok()) {
+      version->PrepareAppend(
+          *cfd->GetLatestMutableCFOptions(),
+          !version_set_->db_options_->skip_stats_update_on_db_open);
+      auto v_iter = versions_.find(cfd->GetID());
+      if (v_iter != versions_.end()) {
+        delete v_iter->second;
+        v_iter->second = version;
+      } else {
+        versions_.emplace(cfd->GetID(), version);
+      }
+    } else {
+      delete version;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
+                                                 const std::string& fpath,
+                                                 int level,
+                                                 const FileMetaData& fmeta) {
+  return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
+}
+
+Status VersionEditHandlerPointInTime::VerifyBlobFile(
+    ColumnFamilyData* cfd, uint64_t blob_file_num,
+    const BlobFileAddition& blob_addition) {
+  BlobSource* blob_source = cfd->blob_source();
+  assert(blob_source);
+  CacheHandleGuard<BlobFileReader> blob_file_reader;
+  Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  // TODO: verify checksum
+  (void)blob_addition;
+  return s;
+}
+
+Status VersionEditHandlerPointInTime::LoadTables(
+    ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
+    bool /*is_initial_load*/) {
+  return Status::OK();
+}
+
+Status ManifestTailer::Initialize() {
+  if (Mode::kRecovery == mode_) {
+    return VersionEditHandler::Initialize();
+  }
+  assert(Mode::kCatchUp == mode_);
+  Status s;
+  if (!initialized_) {
+    ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+    assert(cfd_set);
+    ColumnFamilyData* default_cfd = cfd_set->GetDefault();
+    assert(default_cfd);
+    auto builder_iter = builders_.find(default_cfd->GetID());
+    assert(builder_iter != builders_.end());
+
+    Version* dummy_version = default_cfd->dummy_versions();
+    assert(dummy_version);
+    Version* base_version = dummy_version->Next();
+    assert(base_version);
+    base_version->Ref();
+    VersionBuilderUPtr new_builder(
+        new BaseReferencedVersionBuilder(default_cfd, base_version));
+    builder_iter->second = std::move(new_builder);
+
+    initialized_ = true;
+  }
+  return s;
+}
+
+Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit,
+                                        ColumnFamilyData** cfd) {
+  Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd);
+  if (s.ok()) {
+    assert(cfd);
+    if (*cfd) {
+      cfds_changed_.insert(*cfd);
+    }
+  }
+  return s;
+}
+
+Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
+                                         ColumnFamilyData** cfd) {
+  if (Mode::kRecovery == mode_) {
+    return VersionEditHandler::OnColumnFamilyAdd(edit, cfd);
+  }
+  assert(Mode::kCatchUp == mode_);
+  ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+  assert(cfd_set);
+  ColumnFamilyData* tmp_cfd =
cfd_set->GetColumnFamily(edit.GetColumnFamily()); + assert(cfd); + *cfd = tmp_cfd; + if (!tmp_cfd) { + // For now, ignore new column families created after Recover() succeeds. + return Status::OK(); + } + auto builder_iter = builders_.find(edit.GetColumnFamily()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = tmp_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(tmp_cfd, base_version)); + builder_iter->second = std::move(new_builder); + +#ifndef NDEBUG + auto version_iter = versions_.find(edit.GetColumnFamily()); + assert(version_iter == versions_.end()); +#endif // !NDEBUG + return Status::OK(); +} + +void ManifestTailer::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandlerPointInTime::CheckIterationResult(reader, s); + assert(s); + if (s->ok()) { + if (Mode::kRecovery == mode_) { + mode_ = Mode::kCatchUp; + } else { + assert(Mode::kCatchUp == mode_); + } + } +} + +Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, int level, + const FileMetaData& fmeta) { + Status s = + VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta); + // TODO: Open file or create hard link to prevent the file from being + // deleted. + return s; +} + +void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + if (!s->ok()) { + fprintf(stdout, "%s\n", s->ToString().c_str()); + return; + } + assert(cf_to_cmp_names_); + for (auto* cfd : *(version_set_->column_family_set_)) { + fprintf(stdout, + "--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto it = cf_to_cmp_names_->find(cfd->GetID()); + if (it != cf_to_cmp_names_->end()) { + fprintf(stdout, + "comparator: <%s>, but the comparator object is not available.\n", + it->second.c_str()); + } else { + fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name()); + } + assert(cfd->current()); + + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), + cfd->current()->DebugString(hex_).size(), stdout); + } + fprintf(stdout, + "next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep %" PRIu64 "\n", + version_set_->current_next_file_number(), + version_set_->LastSequence(), version_set_->prev_log_number(), + version_set_->column_family_set_->GetMaxColumnFamily(), + version_set_->min_log_number_to_keep()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit_handler.h b/src/rocksdb/db/version_edit_handler.h new file mode 100644 index 000000000..fd2379b07 --- /dev/null +++ b/src/rocksdb/db/version_edit_handler.h @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandlerBase()
+      : max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
+
+  virtual ~VersionEditHandlerBase() {}
+
+  void Iterate(log::Reader& reader, Status* log_read_status);
+
+  const Status& status() const { return status_; }
+
+  AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
+
+ protected:
+  explicit VersionEditHandlerBase(uint64_t max_read_size)
+      : max_manifest_read_size_(max_read_size) {}
+  virtual Status Initialize() { return Status::OK(); }
+
+  virtual Status ApplyVersionEdit(VersionEdit& edit,
+                                  ColumnFamilyData** cfd) = 0;
+
+  virtual void CheckIterationResult(const log::Reader& /*reader*/,
+                                    Status* /*s*/) {}
+
+  void ClearReadBuffer() { read_buffer_.Clear(); }
+
+  Status status_;
+
+ private:
+  AtomicGroupReadBuffer read_buffer_;
+  const uint64_t max_manifest_read_size_;
+};
+
+class ListColumnFamiliesHandler : public VersionEditHandlerBase {
+ public:
+  ListColumnFamiliesHandler() : VersionEditHandlerBase() {}
+
+  ~ListColumnFamiliesHandler() override {}
+
+  const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
+    return column_family_names_;
+  }
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  // default column family is always implicitly there
+  std::map<uint32_t, std::string> column_family_names_{
+      {0, kDefaultColumnFamilyName}};
+};
+
+class FileChecksumRetriever : public VersionEditHandlerBase {
+ public:
+  FileChecksumRetriever(uint64_t max_read_size,
+                        FileChecksumList& file_checksum_list)
+      : VersionEditHandlerBase(max_read_size),
+        file_checksum_list_(file_checksum_list) {}
+
+  ~FileChecksumRetriever() override {}
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  FileChecksumList& file_checksum_list_;
+};
+
+using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
+
+// A class used for scanning a MANIFEST file.
+// VersionEditHandler reads a MANIFEST file, parses the version edits, and
+// builds the version set's in-memory state, e.g. the version storage info for
+// the versions of column families.
+// To use this class and its subclasses,
+// 1. Create an object of VersionEditHandler or its subclasses.
+//    VersionEditHandler handler(read_only, column_families, version_set,
+//                               track_missing_files,
+//                               no_error_if_files_missing, io_tracer);
+// 2. handler.Iterate(reader, &log_read_status);
+// 3. Check handler.status() and handle possible errors.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandler is shared by multiple threads.
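+//
+// A minimal sketch of that call pattern (illustrative only; `manifest_reader`,
+// `versions` and `io_tracer` stand in for a log::Reader positioned at the
+// start of the MANIFEST, the VersionSet being recovered, and an IO tracer):
+//
+//   std::vector<ColumnFamilyDescriptor> cf_descs = {...};
+//   VersionEditHandler handler(/*read_only=*/false, cf_descs, versions,
+//                              /*track_missing_files=*/false,
+//                              /*no_error_if_files_missing=*/false,
+//                              io_tracer);
+//   Status log_read_status;
+//   handler.Iterate(manifest_reader, &log_read_status);
+//   Status s = handler.status();
+//   if (s.ok()) {
+//     std::string db_id;
+//     handler.GetDbId(&db_id);  // optional: retrieve the DB id if present
+//   }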
+class VersionEditHandler : public VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandler(
+      bool read_only,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer)
+      : VersionEditHandler(read_only, column_families, version_set,
+                           track_missing_files, no_error_if_files_missing,
+                           io_tracer, /*skip_load_table_files=*/false) {}
+
+  ~VersionEditHandler() override {}
+
+  const VersionEditParams& GetVersionEditParams() const {
+    return version_edit_params_;
+  }
+
+  bool HasMissingFiles() const;
+
+  void GetDbId(std::string* db_id) const {
+    if (db_id && version_edit_params_.has_db_id_) {
+      *db_id = version_edit_params_.db_id_;
+    }
+  }
+
+ protected:
+  explicit VersionEditHandler(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnWalAddition(VersionEdit& edit);
+
+  Status OnWalDeletion(VersionEdit& edit);
+
+  Status Initialize() override;
+
+  void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+                           bool* cf_in_builders) const;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
+                                    const VersionEdit& edit);
+
+  virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
+
+  virtual Status MaybeCreateVersion(const VersionEdit& edit,
+                                    ColumnFamilyData* cfd,
+                                    bool force_create_version);
+
+  virtual Status LoadTables(ColumnFamilyData* cfd,
+                            bool prefetch_index_and_filter_in_cache,
+                            bool is_initial_load);
+
+  virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+
+  const bool read_only_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  VersionSet* version_set_;
+  std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
+  std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
+  // Keeps track of column families in manifest that were not found in
+  // column families parameters. If those column families are not dropped
+  // by subsequent manifest records, Recover() will return failure status.
+  std::unordered_map<uint32_t, std::string> column_families_not_found_;
+  VersionEditParams version_edit_params_;
+  const bool track_missing_files_;
+  std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
+      cf_to_missing_files_;
+  std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+  bool no_error_if_files_missing_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  bool skip_load_table_files_;
+  bool initialized_;
+  std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+
+ private:
+  Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+                                    const VersionEdit& edit);
+};
+
+// A class similar to its base class, i.e. VersionEditHandler.
+// VersionEditHandlerPointInTime restores the versions to the most recent point
+// in time such that at this point, the version does not have missing files.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandlerPointInTime is shared by multiple threads.
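+//
+// Sketch of the expected use, with the same placeholder names as the example
+// above; when table or blob files are missing, the scan stops at the most
+// recent version for which every referenced file was still present:
+//
+//   VersionEditHandlerPointInTime pit_handler(/*read_only=*/true, cf_descs,
+//                                             versions, io_tracer);
+//   Status log_read_status;
+//   pit_handler.Iterate(manifest_reader, &log_read_status);
+//   Status s = pit_handler.status();
+//   const bool partial = pit_handler.HasMissingFiles();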
+class VersionEditHandlerPointInTime : public VersionEditHandler {
+ public:
+  VersionEditHandlerPointInTime(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer);
+  ~VersionEditHandlerPointInTime() override;
+
+ protected:
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+  ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
+  Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
+                            bool force_create_version) override;
+  virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& fmeta);
+  virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+                                const BlobFileAddition& blob_addition);
+
+  Status LoadTables(ColumnFamilyData* cfd,
+                    bool prefetch_index_and_filter_in_cache,
+                    bool is_initial_load) override;
+
+  std::unordered_map<uint32_t, Version*> versions_;
+};
+
+class ManifestTailer : public VersionEditHandlerPointInTime {
+ public:
+  explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
+                          VersionSet* version_set,
+                          const std::shared_ptr<IOTracer>& io_tracer)
+      : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
+                                      version_set, io_tracer),
+        mode_(Mode::kRecovery) {}
+
+  void PrepareToReadNewManifest() {
+    initialized_ = false;
+    ClearReadBuffer();
+  }
+
+  std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
+    return cfds_changed_;
+  }
+
+ protected:
+  Status Initialize() override;
+
+  bool MustOpenAllColumnFamilies() const override { return false; }
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+                    const FileMetaData& fmeta) override;
+
+  enum Mode : uint8_t {
+    kRecovery = 0,
+    kCatchUp = 1,
+  };
+
+  Mode mode_;
+  std::unordered_set<ColumnFamilyData*> cfds_changed_;
+};
+
+class DumpManifestHandler : public VersionEditHandler {
+ public:
+  DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
+                      VersionSet* version_set,
+                      const std::shared_ptr<IOTracer>& io_tracer, bool verbose,
+                      bool hex, bool json)
+      : VersionEditHandler(
+            /*read_only=*/true, column_families, version_set,
+            /*track_missing_files=*/false,
+            /*no_error_if_files_missing=*/false, io_tracer,
+            /*skip_load_table_files=*/true),
+        verbose_(verbose),
+        hex_(hex),
+        json_(json),
+        count_(0) {
+    cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
+  }
+
+  ~DumpManifestHandler() override {}
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
+    // Write out each individual edit
+    if (verbose_ && !json_) {
+      // Print out DebugStrings. Can include non-terminating null characters.
+      fwrite(edit.DebugString(hex_).data(), sizeof(char),
+             edit.DebugString(hex_).size(), stdout);
+    } else if (json_) {
+      // Print out DebugStrings. Can include non-terminating null characters.
+ fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } + ++count_; + return VersionEditHandler::ApplyVersionEdit(edit, cfd); + } + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + private: + const bool verbose_; + const bool hex_; + const bool json_; + int count_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc new file mode 100644 index 000000000..c7f271d83 --- /dev/null +++ b/src/rocksdb/db/version_edit_test.cc @@ -0,0 +1,730 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/blob/blob_index.h" +#include "rocksdb/advanced_options.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest : public testing::Test {}; + +TEST_F(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + static const uint32_t kBig32Bit = 1ull << 30; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, + InternalKey("foo", kBig + 500 + i, kTypeValue), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion), + kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, + kInvalidBlobFileNumber, 888, 678, "234", "crc32c", + kNullUniqueId64x2); + edit.DeleteFile(4, kBig + 700 + i); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, EncodeDecodeNewFile4) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), + InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, + kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, + 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + 
edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), + InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, + kBig + 603, true, Temperature::kUnknown, 1001, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + + edit.DeleteFile(4, 700); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); + + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files[0].second.marked_for_compaction); + ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_TRUE(new_files[2].second.marked_for_compaction); + ASSERT_TRUE(new_files[3].second.marked_for_compaction); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[2].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[3].second.fd.GetPathId()); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[0].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[1].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[2].second.oldest_blob_file_number); + ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); +} + +TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { + static const uint64_t kBig = 1ull << 50; + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, + 686, 868, "234", "crc32c", kNullUniqueId64x2); + edit.DeleteFile(4, 700); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + + std::string encoded; + + // Call back function to add extra customized builds. 
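+  // The sync point below appends two extra custom fields (tags 33 and 22) to
+  // the encoded NewFile4 record; neither tag has the non-safe-ignore bit set,
+  // so an older decoder should skip them and still parse the record, which is
+  // what the successful DecodeFrom() below verifies.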
+  bool first = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+        std::string* str = reinterpret_cast<std::string*>(arg);
+        PutVarint32(str, 33);
+        const std::string str1 = "random_string";
+        PutLengthPrefixedSlice(str, str1);
+        if (first) {
+          first = false;
+          PutVarint32(str, 22);
+          const std::string str2 = "s";
+          PutLengthPrefixedSlice(str, str2);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  edit.EncodeTo(&encoded);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_TRUE(!first);
+  auto& new_files = parsed.GetNewFiles();
+  ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+  ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+  ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+  ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+  ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+  static const uint64_t kBig = 1ull << 50;
+  VersionEdit edit;
+  edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+               InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+               kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+
+  std::string encoded;
+
+  // Call back function to add extra customized fields.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+        std::string* str = reinterpret_cast<std::string*>(arg);
+        const std::string str1 = "s";
+        PutLengthPrefixedSlice(str, str1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  edit.EncodeTo(&encoded);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+  VersionEdit edit;
+  edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+               Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  std::string buffer;
+  ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+  VersionEdit edit;
+  edit.SetColumnFamily(2);
+  edit.AddColumnFamily("column_family");
+  edit.SetMaxColumnFamily(5);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetColumnFamily(3);
+  edit.DropColumnFamily();
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+  VersionEdit edit;
+  edit.SetMinLogNumberToKeep(13);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetMinLogNumberToKeep(23);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+  VersionEdit edit;
+  edit.MarkAtomicGroup(1);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+  VersionEdit ve;
+  std::string encoded;
+
+  // Size of ignorable field is too large
+  PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized ignorable tag
+  PutVarint32Varint64(&encoded,
+                      0x2710 /* A field with kTagSafeIgnoreMask set */,
+                      5 /* fieldlength 5 */);
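+  // (0x2710 is 10000 decimal, which has bit 13 set; kTagSafeIgnoreMask is
+  // 1 << 13, so an unknown tag of this form is treated as safely ignorable.)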
encoded += "abc"; // Only fills 3 bytes, + ASSERT_NOK(ve.DecodeFrom(encoded)); + + encoded.clear(); + // Error when seeing unidentified tag that is not ignorable + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* next file number */, 88); + ASSERT_NOK(ve.DecodeFrom(encoded)); + + // Safely ignore an identified but safely ignorable entry + encoded.clear(); + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, + 0x2710 /* A field with kTagSafeIgnoreMask set */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88); + + ASSERT_OK(ve.DecodeFrom(encoded)); + + ASSERT_TRUE(ve.HasLogNumber()); + ASSERT_TRUE(ve.HasNextFile()); + ASSERT_EQ(66, ve.GetLogNumber()); + ASSERT_EQ(88, ve.GetNextFile()); +} + +TEST_F(VersionEditTest, DbId) { + VersionEdit edit; + edit.SetDBId("ab34-cd12-435f-er00"); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetDBId("34ba-cd12-435f-er01"); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) { + VersionEdit edit; + + const std::string checksum_method_prefix = "Hash"; + const std::string checksum_value_prefix = "Value"; + + for (uint64_t blob_file_number = 1; blob_file_number <= 10; + ++blob_file_number) { + const uint64_t total_blob_count = blob_file_number << 10; + const uint64_t total_blob_bytes = blob_file_number << 20; + + std::string checksum_method(checksum_method_prefix); + AppendNumberTo(&checksum_method, blob_file_number); + + std::string checksum_value(checksum_value_prefix); + AppendNumberTo(&checksum_value, blob_file_number); + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const uint64_t garbage_blob_count = total_blob_count >> 2; + const uint64_t garbage_blob_bytes = total_blob_bytes >> 1; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + } + + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, AddWalEncodeDecode) { + VersionEdit edit; + for (uint64_t log_number = 1; log_number <= 20; log_number++) { + WalMetadata meta; + bool has_size = rand() % 2 == 0; + if (has_size) { + meta.SetSyncedSizeInBytes(rand() % 1000); + } + edit.AddWal(log_number, meta); + } + TestEncodeDecode(edit); +} + +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + +TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { + std::string encoded; + + { + // No log number. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != + std::string::npos) + << s.ToString(); + } + + { + // log number should be varint64, + // but we only encode 128 which is not a valid representation of varint64. 
+    char c = 0;
+    unsigned char* ptr = reinterpret_cast<unsigned char*>(&c);
+    *ptr = 128;
+    encoded.append(1, c);
+
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+                std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadTag) {
+  constexpr WalNumber kLogNumber = 100;
+  constexpr uint64_t kSizeInBytes = 100;
+
+  std::string encoded;
+  PutVarint64(&encoded, kLogNumber);
+
+  {
+    // No tag.
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // Only has size tag, no terminate tag.
+    std::string encoded_with_size = encoded;
+    PutVarint32(&encoded_with_size,
+                static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+    PutVarint64(&encoded_with_size, kSizeInBytes);
+
+    std::string encoded_edit =
+        PrefixEncodedWalAdditionWithLength(encoded_with_size);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // Only has terminate tag.
+    std::string encoded_with_terminate = encoded;
+    PutVarint32(&encoded_with_terminate,
+                static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+    std::string encoded_edit =
+        PrefixEncodedWalAdditionWithLength(encoded_with_terminate);
+    VersionEdit edit;
+    ASSERT_OK(edit.DecodeFrom(encoded_edit));
+    auto& wal_addition = edit.GetWalAdditions()[0];
+    ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber);
+    ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize());
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeNoSize) {
+  constexpr WalNumber kLogNumber = 100;
+
+  std::string encoded;
+  PutVarint64(&encoded, kLogNumber);
+  PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+  // No real size after the size tag.
+
+  {
+    // Without terminate tag.
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") !=
+                std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // With terminate tag.
+    PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    // The terminate tag is misunderstood as the size.
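+    // Having consumed the terminate tag as the size, the decoder then runs
+    // out of bytes while looking for the next tag, hence the error below.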
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDebug) {
+  constexpr int n = 2;
+  constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+  constexpr std::array<uint64_t, n> kSizeInBytes{{100, 200}};
+
+  VersionEdit edit;
+  for (int i = 0; i < n; i++) {
+    edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i]));
+  }
+
+  const WalAdditions& wals = edit.GetWalAdditions();
+
+  ASSERT_TRUE(edit.IsWalAddition());
+  ASSERT_EQ(wals.size(), n);
+  for (int i = 0; i < n; i++) {
+    const WalAddition& wal = wals[i];
+    ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]);
+    ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]);
+  }
+
+  std::string expected_str = "VersionEdit {\n";
+  for (int i = 0; i < n; i++) {
+    std::stringstream ss;
+    ss << "  WalAddition: log_number: " << kLogNumbers[i]
+       << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n";
+    expected_str += ss.str();
+  }
+  expected_str += "  ColumnFamily: 0\n}\n";
+  ASSERT_EQ(edit.DebugString(true), expected_str);
+
+  std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": [";
+  for (int i = 0; i < n; i++) {
+    std::stringstream ss;
+    ss << "{\"LogNumber\": " << kLogNumbers[i] << ", "
+       << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}";
+    if (i < n - 1) ss << ", ";
+    expected_json += ss.str();
+  }
+  expected_json += "], \"ColumnFamily\": 0}";
+  ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, DeleteWalEncodeDecode) {
+  VersionEdit edit;
+  edit.DeleteWalsBefore(rand() % 100);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, DeleteWalDebug) {
+  constexpr int n = 2;
+  constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+
+  VersionEdit edit;
+  edit.DeleteWalsBefore(kLogNumbers[n - 1]);
+
+  const WalDeletion& wal = edit.GetWalDeletion();
+
+  ASSERT_TRUE(edit.IsWalDeletion());
+  ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]);
+
+  std::string expected_str = "VersionEdit {\n";
+  {
+    std::stringstream ss;
+    ss << "  WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n";
+    expected_str += ss.str();
+  }
+  expected_str += "  ColumnFamily: 0\n}\n";
+  ASSERT_EQ(edit.DebugString(true), expected_str);
+
+  std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": ";
+  {
+    std::stringstream ss;
+    ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}";
+    expected_json += ss.str();
+  }
+  expected_json += ", \"ColumnFamily\": 0}";
+  ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, FullHistoryTsLow) {
+  VersionEdit edit;
+  ASSERT_FALSE(edit.HasFullHistoryTsLow());
+  std::string ts = test::EncodeInt(0);
+  edit.SetFullHistoryTsLow(ts);
+  TestEncodeDecode(edit);
+}
+
+// Tests that if RocksDB is downgraded, the new types of VersionEdits
+// that have a tag larger than kTagSafeIgnoreMask can be safely ignored.
+TEST_F(VersionEditTest, IgnorableTags) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) {
+        bool* ignore = static_cast<bool*>(arg);
+        *ignore = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t kPrevLogNumber = 100;
+  constexpr uint64_t kLogNumber = 200;
+  constexpr uint64_t kNextFileNumber = 300;
+  constexpr uint64_t kColumnFamilyId = 400;
+
+  VersionEdit edit;
+  // Add some ignorable entries.
+  for (int i = 0; i < 2; i++) {
+    edit.AddWal(i + 1, WalMetadata(i + 2));
+  }
+  edit.SetDBId("db_id");
+  // Add unignorable entries.
+ edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. + ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present. + ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(edit.GetLogNumber(), kLogNumber); + ASSERT_EQ(edit.GetNextFile(), kNextFileNumber); + ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) { + FileMetaData meta; + + { + constexpr uint64_t file_number = 10; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 0; + + meta.fd = FileDescriptor(file_number, path_id, file_size); + } + + constexpr char key[] = "foo"; + + constexpr uint64_t expected_oldest_blob_file_number = 20; + + // Plain old value (does not affect oldest_blob_file_number) + { + constexpr char value[] = "value"; + constexpr SequenceNumber seq = 200; + + ASSERT_OK(meta.UpdateBoundaries(key, value, seq, kTypeValue)); + ASSERT_EQ(meta.oldest_blob_file_number, kInvalidBlobFileNumber); + } + + // Non-inlined, non-TTL blob index (sets oldest_blob_file_number) + { + constexpr uint64_t blob_file_number = 25; + static_assert(blob_file_number > expected_oldest_blob_file_number, + "unexpected"); + + constexpr uint64_t offset = 1000; + constexpr uint64_t size = 100; + + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + constexpr SequenceNumber seq = 201; + + ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)); + ASSERT_EQ(meta.oldest_blob_file_number, blob_file_number); + } + + // Another one, with the oldest blob file number (updates + // oldest_blob_file_number) + { + constexpr uint64_t offset = 2000; + constexpr uint64_t size = 300; + + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, expected_oldest_blob_file_number, offset, + size, kNoCompression); + + constexpr SequenceNumber seq = 202; + + ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)); + ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number); + } + + // Inlined TTL blob index (does not affect oldest_blob_file_number) + { + constexpr uint64_t expiration = 9876543210; + constexpr char value[] = "value"; + + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value); + + constexpr SequenceNumber seq = 203; + + ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)); + ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number); + } + + // Non-inlined TTL blob index (does not affect oldest_blob_file_number, even + // though file number is smaller) + { + constexpr uint64_t expiration = 9876543210; + constexpr uint64_t blob_file_number = 15; + static_assert(blob_file_number < expected_oldest_blob_file_number, + "unexpected"); + + 
constexpr uint64_t offset = 2000; + constexpr uint64_t size = 500; + + std::string blob_index; + BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset, + size, kNoCompression); + + constexpr SequenceNumber seq = 204; + + ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)); + ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number); + } + + // Corrupt blob index + { + constexpr char corrupt_blob_index[] = "!corrupt!"; + constexpr SequenceNumber seq = 205; + + ASSERT_TRUE( + meta.UpdateBoundaries(key, corrupt_blob_index, seq, kTypeBlobIndex) + .IsCorruption()); + ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number); + } + + // Invalid blob file number + { + constexpr uint64_t offset = 10000; + constexpr uint64_t size = 1000; + + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, kInvalidBlobFileNumber, offset, size, + kNoCompression); + + constexpr SequenceNumber seq = 206; + + ASSERT_TRUE(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex) + .IsCorruption()); + ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc new file mode 100644 index 000000000..427af6e25 --- /dev/null +++ b/src/rocksdb/db/version_set.cc @@ -0,0 +1,6903 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_set.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_source.h" +#include "db/compaction/compaction.h" +#include "db/compaction/file_pri.h" +#include "db/dbformat.h" +#include "db/internal_stats.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/table_cache.h" +#include "db/version_builder.h" +#include "db/version_edit_handler.h" +#if USE_COROUTINES +#include "folly/experimental/coro/BlockingWait.h" +#include "folly/experimental/coro/Collect.h" +#endif +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "monitoring/file_read_sample.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/merging_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_reader.h" +#include "table/two_level_iterator.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/coro_utils.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/user_comparator_wrapper.h" + +// Generate the regular and coroutine versions of some methods by +// including version_set_sync_and_async.h twice +// Macros in the header will expand differently based on whether +// WITH_COROUTINES or WITHOUT_COROUTINES is defined +// clang-format off +#define WITHOUT_COROUTINES +#include "db/version_set_sync_and_async.h" +#undef WITHOUT_COROUTINES +#define WITH_COROUTINES +#include "db/version_set_sync_and_async.h" +#undef WITH_COROUTINES +// clang-format on + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Find File in LevelFilesBrief data structure +// Within an index range defined by left and right +int FindFileInRange(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const Slice& key, + uint32_t left, uint32_t right) { + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto& b = file_level.files; + return static_cast(std::lower_bound(b + left, b + right, key, cmp) - b); +} + +Status OverlapWithIterator(const Comparator* ucmp, + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, bool* overlap) { + InternalKey range_start(smallest_user_key, kMaxSequenceNumber, + kValueTypeForSeek); + iter->Seek(range_start.Encode()); + if (!iter->status().ok()) { + return iter->status(); + } + + *overlap = false; + if (iter->Valid()) { + ParsedInternalKey seek_result; + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, 
+                                      largest_user_key) <= 0) {
+      *overlap = true;
+    }
+  }
+
+  return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
+class FilePicker {
+ public:
+  FilePicker(const Slice& user_key, const Slice& ikey,
+             autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
+             FileIndexer* file_indexer, const Comparator* user_comparator,
+             const InternalKeyComparator* internal_comparator)
+      : num_levels_(num_levels),
+        curr_level_(static_cast<unsigned int>(-1)),
+        returned_file_level_(static_cast<unsigned int>(-1)),
+        hit_file_level_(static_cast<unsigned int>(-1)),
+        search_left_bound_(0),
+        search_right_bound_(FileIndexer::kLevelMaxIndex),
+        level_files_brief_(file_levels),
+        is_hit_file_last_in_level_(false),
+        curr_file_level_(nullptr),
+        user_key_(user_key),
+        ikey_(ikey),
+        file_indexer_(file_indexer),
+        user_comparator_(user_comparator),
+        internal_comparator_(internal_comparator) {
+    // Setup member variables to search first level.
+    search_ended_ = !PrepareNextLevel();
+    if (!search_ended_) {
+      // Prefetch Level 0 table data to avoid cache miss if possible.
+      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+        if (r) {
+          r->Prepare(ikey);
+        }
+      }
+    }
+  }
+
+  int GetCurrentLevel() const { return curr_level_; }
+
+  FdWithKeyRange* GetNextFile() {
+    while (!search_ended_) {  // Loops over different levels.
+      while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+        // Loops over all files in current level.
+        FdWithKeyRange* f =
+            &curr_file_level_->files[curr_index_in_curr_level_];
+        hit_file_level_ = curr_level_;
+        is_hit_file_last_in_level_ =
+            curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+        int cmp_largest = -1;
+
+        // Do key range filtering of files and/or fractional cascading if:
+        // (1) not all the files are in level 0, or
+        // (2) there are more than 3 current level files
+        // If there are only 3 or less current level files in the system, we
+        // skip the key range filtering. In this case, more likely, the system
+        // is highly tuned to minimize number of tables queried by each query,
+        // so it is unlikely that key range filtering is more efficient than
+        // querying the files.
+        if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+          // Check if key is within a file's range. If search left bound and
+          // right bound point to the same find, we are sure key falls in
+          // range.
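+          // The two comparison results also feed
+          // FileIndexer::GetNextLevelIndex() below, which narrows the binary
+          // search bounds for this key on the next level; this is the
+          // fractional cascading mentioned above.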
+ assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level_ > 0) { + file_indexer_->GetNextLevelIndex( + curr_level_, curr_index_in_curr_level_, cmp_smallest, + cmp_largest, &search_left_bound_, &search_right_bound_); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (curr_level_ == 0) { + ++curr_index_in_curr_level_; + continue; + } else { + // Search next level. + break; + } + } + } + + returned_file_level_ = curr_level_; + if (curr_level_ > 0 && cmp_largest < 0) { + // No more files to search in this level. + search_ended_ = !PrepareNextLevel(); + } else { + ++curr_index_in_curr_level_; + } + return f; + } + // Start searching next level. + search_ended_ = !PrepareNextLevel(); + } + // Search ended. + return nullptr; + } + + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + + private: + unsigned int num_levels_; + unsigned int curr_level_; + unsigned int returned_file_level_; + unsigned int hit_file_level_; + int32_t search_left_bound_; + int32_t search_right_bound_; + autovector* level_files_brief_; + bool search_ended_; + bool is_hit_file_last_in_level_; + LevelFilesBrief* curr_file_level_; + unsigned int curr_index_in_curr_level_; + unsigned int start_index_in_curr_level_; + Slice user_key_; + Slice ikey_; + FileIndexer* file_indexer_; + const Comparator* user_comparator_; + const InternalKeyComparator* internal_comparator_; + + // Setup local variables to search next level. + // Returns false if there are no more levels to search. + bool PrepareNextLevel() { + curr_level_++; + while (curr_level_ < num_levels_) { + curr_file_level_ = &(*level_files_brief_)[curr_level_]; + if (curr_file_level_->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound_ == 0); + assert(search_right_bound_ == -1 || + search_right_bound_ == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in + // the next level + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, this can occur at + // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes + // are always compacted into a single entry). + int32_t start_index; + if (curr_level_ == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the + // earliest file whose largest key >= ikey. 
+        // Search left bound and right bound are used to narrow the range.
+        if (search_left_bound_ <= search_right_bound_) {
+          if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+            search_right_bound_ =
+                static_cast<int32_t>(curr_file_level_->num_files) - 1;
+          }
+          // `search_right_bound_` is an inclusive upper-bound, but since it
+          // was determined based on user key, it is still possible the lookup
+          // key falls to the right of `search_right_bound_`'s corresponding
+          // file. So, pass a limit one higher, which allows us to detect this
+          // case.
+          start_index =
+              FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+                              static_cast<uint32_t>(search_left_bound_),
+                              static_cast<uint32_t>(search_right_bound_) + 1);
+          if (start_index == search_right_bound_ + 1) {
+            // `ikey_` comes after `search_right_bound_`. The lookup key does
+            // not exist on this level, so let's skip this level and do a full
+            // binary search on the next level.
+            search_left_bound_ = 0;
+            search_right_bound_ = FileIndexer::kLevelMaxIndex;
+            curr_level_++;
+            continue;
+          }
+        } else {
+          // search_left_bound > search_right_bound, key does not exist in
+          // this level. Since no comparison is done in this level, it will
+          // need to search all files in the next level.
+          search_left_bound_ = 0;
+          search_right_bound_ = FileIndexer::kLevelMaxIndex;
+          curr_level_++;
+          continue;
+        }
+      }
+      start_index_in_curr_level_ = start_index;
+      curr_index_in_curr_level_ = start_index;
+
+      return true;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+}  // anonymous namespace
+
+class FilePickerMultiGet {
+ private:
+  struct FilePickerContext;
+
+ public:
+  FilePickerMultiGet(MultiGetRange* range,
+                     autovector<LevelFilesBrief>* file_levels,
+                     unsigned int num_levels, FileIndexer* file_indexer,
+                     const Comparator* user_comparator,
+                     const InternalKeyComparator* internal_comparator)
+      : num_levels_(num_levels),
+        curr_level_(static_cast<unsigned int>(-1)),
+        returned_file_level_(static_cast<unsigned int>(-1)),
+        hit_file_level_(static_cast<unsigned int>(-1)),
+        range_(*range, range->begin(), range->end()),
+        maybe_repeat_key_(false),
+        current_level_range_(*range, range->begin(), range->end()),
+        current_file_range_(*range, range->begin(), range->end()),
+        batch_iter_(range->begin()),
+        batch_iter_prev_(range->begin()),
+        upper_key_(range->begin()),
+        level_files_brief_(file_levels),
+        is_hit_file_last_in_level_(false),
+        curr_file_level_(nullptr),
+        file_indexer_(file_indexer),
+        user_comparator_(user_comparator),
+        internal_comparator_(internal_comparator),
+        hit_file_(nullptr) {
+    for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+      fp_ctx_array_[iter.index()] =
+          FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+    }
+
+    // Setup member variables to search first level.
+    search_ended_ = !PrepareNextLevel();
+    if (!search_ended_) {
+      // REVISIT
+      // Prefetch Level 0 table data to avoid cache miss if possible.
+      // As of now, only PlainTableReader and CuckooTableReader do any
+      // prefetching.
This may not be necessary anymore once we implement + // batching in those table readers + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; + if (r) { + for (auto iter = range_.begin(); iter != range_.end(); ++iter) { + r->Prepare(iter->ikey); + } + } + } + } + } + + FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other) + : num_levels_(other.num_levels_), + curr_level_(other.curr_level_), + returned_file_level_(other.returned_file_level_), + hit_file_level_(other.hit_file_level_), + fp_ctx_array_(other.fp_ctx_array_), + range_(*range, range->begin(), range->end()), + maybe_repeat_key_(false), + current_level_range_(*range, range->begin(), range->end()), + current_file_range_(*range, range->begin(), range->end()), + batch_iter_(range->begin()), + batch_iter_prev_(range->begin()), + upper_key_(range->begin()), + level_files_brief_(other.level_files_brief_), + is_hit_file_last_in_level_(false), + curr_file_level_(other.curr_file_level_), + file_indexer_(other.file_indexer_), + user_comparator_(other.user_comparator_), + internal_comparator_(other.internal_comparator_), + hit_file_(nullptr) { + PrepareNextLevelForSearch(); + } + + int GetCurrentLevel() const { return curr_level_; } + + void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); } + + FdWithKeyRange* GetNextFileInLevel() { + if (batch_iter_ == current_level_range_.end() || search_ended_) { + hit_file_ = nullptr; + return nullptr; + } else { + if (maybe_repeat_key_) { + maybe_repeat_key_ = false; + // Check if we found the final value for the last key in the + // previous lookup range. If we did, then there's no need to look + // any further for that key, so advance batch_iter_. Else, keep + // batch_iter_ positioned on that key so we look it up again in + // the next file + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { + batch_iter_ = upper_key_; + } + } + // batch_iter_prev_ will become the start key for the next file + // lookup + batch_iter_prev_ = batch_iter_; + } + + MultiGetRange next_file_range(current_level_range_, batch_iter_prev_, + current_level_range_.end()); + size_t curr_file_index = + (batch_iter_ != current_level_range_.end()) + ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level + : curr_file_level_->num_files; + FdWithKeyRange* f; + bool is_last_key_in_file; + if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f, + &is_last_key_in_file)) { + hit_file_ = nullptr; + return nullptr; + } else { + if (is_last_key_in_file) { + // Since cmp_largest is 0, batch_iter_ still points to the last key + // that falls in this file, instead of the next one. 
+        // Increment the file index for all keys between batch_iter_ and
+        // upper_key_
+        auto tmp_iter = batch_iter_;
+        while (tmp_iter != upper_key_) {
+          ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+          ++tmp_iter;
+        }
+        maybe_repeat_key_ = true;
+      }
+      // Set the range for this file
+      current_file_range_ =
+          MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
+      returned_file_level_ = curr_level_;
+      hit_file_level_ = curr_level_;
+      is_hit_file_last_in_level_ =
+          curr_file_index == curr_file_level_->num_files - 1;
+      hit_file_ = f;
+      return f;
+    }
+  }
+
+  // getter for current file level
+  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+  unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+  FdWithKeyRange* GetHitFile() { return hit_file_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFile()) is at the last index in its level.
+  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+  bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
+
+  bool IsSearchEnded() { return search_ended_; }
+
+  const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
+  bool RemainingOverlapInLevel() {
+    return !current_level_range_.Suffix(current_file_range_).empty();
+  }
+
+  MultiGetRange& GetRange() { return range_; }
+
+  void ReplaceRange(const MultiGetRange& other) {
+    assert(hit_file_ == nullptr);
+    range_ = other;
+    current_level_range_ = other;
+  }
+
+  FilePickerMultiGet(FilePickerMultiGet&& other)
+      : num_levels_(other.num_levels_),
+        curr_level_(other.curr_level_),
+        returned_file_level_(other.returned_file_level_),
+        hit_file_level_(other.hit_file_level_),
+        fp_ctx_array_(std::move(other.fp_ctx_array_)),
+        range_(std::move(other.range_)),
+        maybe_repeat_key_(other.maybe_repeat_key_),
+        current_level_range_(std::move(other.current_level_range_)),
+        current_file_range_(std::move(other.current_file_range_)),
+        batch_iter_(other.batch_iter_, &current_level_range_),
+        batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
+        upper_key_(other.upper_key_, &current_level_range_),
+        level_files_brief_(other.level_files_brief_),
+        search_ended_(other.search_ended_),
+        is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
+        curr_file_level_(other.curr_file_level_),
+        file_indexer_(other.file_indexer_),
+        user_comparator_(other.user_comparator_),
+        internal_comparator_(other.internal_comparator_),
+        hit_file_(other.hit_file_) {}
+
+ private:
+  unsigned int num_levels_;
+  unsigned int curr_level_;
+  unsigned int returned_file_level_;
+  unsigned int hit_file_level_;
+
+  struct FilePickerContext {
+    int32_t search_left_bound;
+    int32_t search_right_bound;
+    unsigned int curr_index_in_curr_level;
+    unsigned int start_index_in_curr_level;
+
+    FilePickerContext(int32_t left, int32_t right)
+        : search_left_bound(left),
+          search_right_bound(right),
+          curr_index_in_curr_level(0),
+          start_index_in_curr_level(0) {}
+
+    FilePickerContext() = default;
+  };
+  std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+  MultiGetRange range_;
+  bool maybe_repeat_key_;
+  MultiGetRange current_level_range_;
+  MultiGetRange current_file_range_;
+  // Iterator to iterate through the keys in a MultiGet batch, that gets reset
+  // at the beginning of each level.
+  MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e. the
+  // last key found in the previous SST file, in order to serve as the start
+  // of the batch key range for the next SST file
+  MultiGetRange::Iterator batch_iter_prev_;
+  MultiGetRange::Iterator upper_key_;
+  autovector<LevelFilesBrief>* level_files_brief_;
+  bool search_ended_;
+  bool is_hit_file_last_in_level_;
+  LevelFilesBrief* curr_file_level_;
+  FileIndexer* file_indexer_;
+  const Comparator* user_comparator_;
+  const InternalKeyComparator* internal_comparator_;
+  FdWithKeyRange* hit_file_;
+
+  // Iterates through files in the current level until it finds a file that
+  // contains at least one key from the MultiGet batch
+  bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+                                  size_t* file_index, FdWithKeyRange** fd,
+                                  bool* is_last_key_in_file) {
+    size_t curr_file_index = *file_index;
+    FdWithKeyRange* f = nullptr;
+    bool file_hit = false;
+    int cmp_largest = -1;
+    if (curr_file_index >= curr_file_level_->num_files) {
+      // In the unlikely case the next key is a duplicate of the current key,
+      // and the current key is the last in the level and the internal key
+      // was not found, we need to skip lookup for the remaining keys and
+      // reset the search bounds
+      if (batch_iter_ != current_level_range_.end()) {
+        ++batch_iter_;
+        for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+          struct FilePickerContext& fp_ctx =
+              fp_ctx_array_[batch_iter_.index()];
+          fp_ctx.search_left_bound = 0;
+          fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+        }
+      }
+      return false;
+    }
+    // Loops over keys in the MultiGet batch until it finds a file with
+    // at least one of the keys. Then it keeps moving forward until the
+    // last key in the batch that falls in that file
+    while (batch_iter_ != current_level_range_.end() &&
+           (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+                curr_file_index ||
+            !file_hit)) {
+      struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+      f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+      Slice& user_key = batch_iter_->ukey_without_ts;
+
+      // Do key range filtering of files and/or fractional cascading if:
+      // (1) not all the files are in level 0, or
+      // (2) there are more than 3 current level files
+      // If there are only 3 or fewer current level files in the system, we
+      // skip the key range filtering. In this case, more likely, the system
+      // is highly tuned to minimize the number of tables queried by each
+      // query, so it is unlikely that key range filtering is more efficient
+      // than querying the files.
+      if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if key is within a file's range. If search left bound and
+        // right bound point to the same file, we are sure key falls in
+        // range.
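+        // Illustrative example (editorial, not upstream text): for a file
+        // spanning user keys [k3, k7], the comparisons below yield
+        //   k1 -> cmp_smallest < 0                   (key before the file)
+        //   k3 -> cmp_smallest == 0, cmp_largest < 0 (key inside the file)
+        //   k7 -> cmp_largest == 0                   (last key in the file)
+        //   k9 -> cmp_largest > 0                    (key after the file)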
+        int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+            user_key, false, ExtractUserKey(f->smallest_key), true);
+
+        assert(curr_level_ == 0 ||
+               fp_ctx.curr_index_in_curr_level ==
+                   fp_ctx.start_index_in_curr_level ||
+               cmp_smallest <= 0);
+
+        if (cmp_smallest >= 0) {
+          cmp_largest = user_comparator_->CompareWithoutTimestamp(
+              user_key, false, ExtractUserKey(f->largest_key), true);
+        } else {
+          cmp_largest = -1;
+        }
+
+        // Setup file search bound for the next level based on the
+        // comparison results
+        if (curr_level_ > 0) {
+          file_indexer_->GetNextLevelIndex(
+              curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+              cmp_largest, &fp_ctx.search_left_bound,
+              &fp_ctx.search_right_bound);
+        }
+        // Key falls out of current file's range
+        if (cmp_smallest < 0 || cmp_largest > 0) {
+          next_file_range->SkipKey(batch_iter_);
+        } else {
+          file_hit = true;
+        }
+      } else {
+        file_hit = true;
+      }
+      if (cmp_largest == 0) {
+        // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. However, it's possible there are
+        // duplicates in the batch, so find the upper bound for the batch
+        // in this file (upper_key_) by skipping past the duplicates. We
+        // leave batch_iter_ as is since we may have to pick up from there
+        // for the next file, if this file has a merge value rather than
+        // final value
+        upper_key_ = batch_iter_;
+        ++upper_key_;
+        while (upper_key_ != current_level_range_.end() &&
+               user_comparator_->CompareWithoutTimestamp(
+                   batch_iter_->ukey_without_ts, false,
+                   upper_key_->ukey_without_ts, false) == 0) {
+          ++upper_key_;
+        }
+        break;
+      } else {
+        if (curr_level_ == 0) {
+          // We need to look through all files in level 0
+          ++fp_ctx.curr_index_in_curr_level;
+        }
+        ++batch_iter_;
+      }
+      if (!file_hit) {
+        curr_file_index =
+            (batch_iter_ != current_level_range_.end())
+                ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+                : curr_file_level_->num_files;
+      }
+    }
+
+    *fd = f;
+    *file_index = curr_file_index;
+    *is_last_key_in_file = cmp_largest == 0;
+    if (!*is_last_key_in_file) {
+      // If the largest key in the batch overlapping the file is not the
+      // largest key in the file, upper_key_ would not have been updated, so
+      // update it here
+      upper_key_ = batch_iter_;
+    }
+    return file_hit;
+  }
+
+  // Setup local variables to search next level.
+  // Returns false if there are no more levels to search.
+  bool PrepareNextLevel() {
+    if (curr_level_ == 0) {
+      MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+      if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+          curr_file_level_->num_files) {
+        batch_iter_prev_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
+        return true;
+      }
+    }
+
+    curr_level_++;
+    // Reset key range to saved value
+    while (curr_level_ < num_levels_) {
+      bool level_contains_keys = false;
+      curr_file_level_ = &(*level_files_brief_)[curr_level_];
+      if (curr_file_level_->num_files == 0) {
+        // When current level is empty, the search bound generated from upper
+        // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+        // also empty.
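+        // Hedged illustration (editorial): [0, -1] is what the indexer of a
+        // non-empty level above produces when there is nothing here to map
+        // to, while [0, kLevelMaxIndex] is the reset value left behind when
+        // the level above was itself empty; either way, the next non-empty
+        // level must be searched in full.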
+        for (auto mget_iter = current_level_range_.begin();
+             mget_iter != current_level_range_.end(); ++mget_iter) {
+          struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+          assert(fp_ctx.search_left_bound == 0);
+          assert(fp_ctx.search_right_bound == -1 ||
+                 fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+          // Since current level is empty, it will need to search all files
+          // in the next level
+          fp_ctx.search_left_bound = 0;
+          fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+        }
+        // Skip all subsequent empty levels
+        do {
+          ++curr_level_;
+        } while ((curr_level_ < num_levels_) &&
+                 (*level_files_brief_)[curr_level_].num_files == 0);
+        continue;
+      }
+
+      // Some files may overlap each other. We find
+      // all files that overlap user_key and process them in order from
+      // newest to oldest. In the context of merge-operator, this can occur
+      // at any level. Otherwise, it only occurs at Level-0 (since
+      // Put/Deletes are always compacted into a single entry).
+      int32_t start_index = -1;
+      current_level_range_ =
+          MultiGetRange(range_, range_.begin(), range_.end());
+      for (auto mget_iter = current_level_range_.begin();
+           mget_iter != current_level_range_.end(); ++mget_iter) {
+        struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+        if (curr_level_ == 0) {
+          // On Level-0, we read through all files to check for overlap.
+          start_index = 0;
+          level_contains_keys = true;
+        } else {
+          // On Level-n (n>=1), files are sorted. Binary search to find the
+          // earliest file whose largest key >= ikey. Search left bound and
+          // right bound are used to narrow the range.
+          if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+            if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+              fp_ctx.search_right_bound =
+                  static_cast<int32_t>(curr_file_level_->num_files) - 1;
+            }
+            // `search_right_bound` is an inclusive upper bound, but since it
+            // was determined based on user key, it is still possible the
+            // lookup key falls to the right of `search_right_bound`'s
+            // corresponding file. So, pass a limit one higher, which allows
+            // us to detect this case.
+            Slice& ikey = mget_iter->ikey;
+            start_index = FindFileInRange(
+                *internal_comparator_, *curr_file_level_, ikey,
+                static_cast<uint32_t>(fp_ctx.search_left_bound),
+                static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+            if (start_index == fp_ctx.search_right_bound + 1) {
+              // `ikey` comes after `search_right_bound`. The lookup key does
+              // not exist on this level, so let's skip this level and do a
+              // full binary search on the next level.
+              fp_ctx.search_left_bound = 0;
+              fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+              current_level_range_.SkipKey(mget_iter);
+              continue;
+            } else {
+              level_contains_keys = true;
+            }
+          } else {
+            // search_left_bound > search_right_bound, key does not exist in
+            // this level. Since no comparison is done in this level, it will
+            // need to search all files in the next level.
+            fp_ctx.search_left_bound = 0;
+            fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+            current_level_range_.SkipKey(mget_iter);
+            continue;
+          }
+        }
+        fp_ctx.start_index_in_curr_level = start_index;
+        fp_ctx.curr_index_in_curr_level = start_index;
+      }
+      if (level_contains_keys) {
+        batch_iter_prev_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
+        return true;
+      }
+      curr_level_++;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+      FileMetaData* f = storage_info_.files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        assert(cfd_ != nullptr);
+        uint32_t path_id = f->fd.GetPathId();
+        assert(path_id < cfd_->ioptions()->cf_paths.size());
+        vset_->obsolete_files_.push_back(
+            ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path,
+                             cfd_->GetFileMetadataCacheReservationManager()));
+      }
+    }
+  }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const LevelFilesBrief& file_level, const Slice& key) {
+  return FindFileInRange(icmp, file_level, key, 0,
+                         static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                               const std::vector<FileMetaData*>& files,
+                               Arena* arena) {
+  assert(file_level);
+  assert(arena);
+
+  size_t num = files.size();
+  file_level->num_files = num;
+  char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+  file_level->files = new (mem) FdWithKeyRange[num];
+
+  for (size_t i = 0; i < num; i++) {
+    Slice smallest_key = files[i]->smallest.Encode();
+    Slice largest_key = files[i]->largest.Encode();
+
+    // Copy key slice to sequential memory
+    size_t smallest_size = smallest_key.size();
+    size_t largest_size = largest_key.size();
+    mem = arena->AllocateAligned(smallest_size + largest_size);
+    memcpy(mem, smallest_key.data(), smallest_size);
+    memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+    FdWithKeyRange& f = file_level->files[i];
+    f.fd = files[i]->fd;
+    f.file_metadata = files[i];
+    f.smallest_key = Slice(mem, smallest_size);
+    f.largest_key = Slice(mem + smallest_size, largest_size);
+  }
+}
+
+static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
+                      const FdWithKeyRange* f) {
+  // nullptr user_key occurs before all keys and is therefore never after *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
+                       const FdWithKeyRange* f) {
+  // nullptr user_key occurs after all keys and is therefore never before *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->smallest_key)) < 0);
+}
+
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+                           bool disjoint_sorted_files,
+                           const LevelFilesBrief& file_level,
+                           const Slice* smallest_user_key,
+                           const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      const FdWithKeyRange* f = &(file_level.files[i]);
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the leftmost possible internal key for smallest_user_key
+    InternalKey small;
+    small.SetMinPossibleForUserKey(*smallest_user_key);
+    index = FindFile(icmp, file_level, small.Encode());
+  }
+
+  if (index >= file_level.num_files) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+  // @param read_options Must outlive this iterator.
+  LevelIterator(
+      TableCache* table_cache, const ReadOptions& read_options,
+      const FileOptions& file_options,
+      const InternalKeyComparator& icomparator, const LevelFilesBrief* flevel,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      bool should_sample, HistogramImpl* file_read_hist,
+      TableReaderCaller caller, bool skip_filters, int level,
+      RangeDelAggregator* range_del_agg,
+      const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
+          nullptr,
+      bool allow_unprepared_value = false,
+      TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
+      : table_cache_(table_cache),
+        read_options_(read_options),
+        file_options_(file_options),
+        icomparator_(icomparator),
+        user_comparator_(icomparator.user_comparator()),
+        flevel_(flevel),
+        prefix_extractor_(prefix_extractor),
+        file_read_hist_(file_read_hist),
+        should_sample_(should_sample),
+        caller_(caller),
+        skip_filters_(skip_filters),
+        allow_unprepared_value_(allow_unprepared_value),
+        file_index_(flevel_->num_files),
+        level_(level),
+        range_del_agg_(range_del_agg),
+        pinned_iters_mgr_(nullptr),
+        compaction_boundaries_(compaction_boundaries),
+        is_next_read_sequential_(false),
+        range_tombstone_iter_(nullptr),
+        to_return_sentinel_(false) {
+    // Empty level is not supported.
+    assert(flevel_ != nullptr && flevel_->num_files > 0);
+    if (range_tombstone_iter_ptr_) {
+      *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
+    }
+  }
+
+  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+  // Seek to the first file with a key >= target.
+  // If range_tombstone_iter_ is not nullptr, then we pretend that file
+  // boundaries are fake keys (sentinel keys). These keys are used to keep
+  // range tombstones alive even when all point keys in an SST file are
+  // exhausted. These sentinel keys will be skipped by the merging iterator.
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult* result) override;
+  void Prev() override;
+
+  // In addition to the valid and invalid states (!file_iter_.Valid() and
+  // status.ok()), a third state of the iterator is when !file_iter_.Valid()
+  // and to_return_sentinel_. This means we are at the end of a file, and a
+  // sentinel key (the file boundary that we pretend is a key) is to be
+  // returned next. file_iter_.Valid() and to_return_sentinel_ should not
+  // both be true.
+  bool Valid() const override {
+    assert(!(file_iter_.Valid() && to_return_sentinel_));
+    return file_iter_.Valid() || to_return_sentinel_;
+  }
+  Slice key() const override {
+    assert(Valid());
+    if (to_return_sentinel_) {
+      // Sentinel should be returned after file_iter_ reaches the end of the
+      // file
+      assert(!file_iter_.Valid());
+      return sentinel_;
+    }
+    return file_iter_.key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    assert(!to_return_sentinel_);
+    return file_iter_.value();
+  }
+
+  Status status() const override {
+    return file_iter_.iter() ? file_iter_.status() : Status::OK();
+  }
+
+  bool PrepareValue() override { return file_iter_.PrepareValue(); }
+
+  inline bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+  }
+
+  inline IterBoundCheck UpperBoundCheckResult() override {
+    if (Valid()) {
+      return file_iter_.UpperBoundCheckResult();
+    } else {
+      return IterBoundCheck::kUnknown;
+    }
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_.iter()) {
+      file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+    }
+  }
+
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsValuePinned();
+  }
+
+  bool IsDeleteRangeSentinelKey() const override {
+    return to_return_sentinel_;
+  }
+
+ private:
+  // Return true if at least one invalid file is seen and skipped.
+  bool SkipEmptyFileForward();
+  void SkipEmptyFileBackward();
+  void SetFileIterator(InternalIterator* iter);
+  void InitFileIterator(size_t new_file_index);
+
+  const Slice& file_smallest_key(size_t file_index) {
+    assert(file_index < flevel_->num_files);
+    return flevel_->files[file_index].smallest_key;
+  }
+
+  const Slice& file_largest_key(size_t file_index) {
+    assert(file_index < flevel_->num_files);
+    return flevel_->files[file_index].largest_key;
+  }
+
+  bool KeyReachedUpperBound(const Slice& internal_key) {
+    return read_options_.iterate_upper_bound != nullptr &&
+           user_comparator_.CompareWithoutTimestamp(
+               ExtractUserKey(internal_key), /*a_has_ts=*/true,
+               *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
+  }
+
+  void ClearRangeTombstoneIter() {
+    if (range_tombstone_iter_ && *range_tombstone_iter_) {
+      delete *range_tombstone_iter_;
+      *range_tombstone_iter_ = nullptr;
+    }
+  }
+
+  // Move file_iter_ to the file at file_index_.
+  // range_tombstone_iter_ is updated with a range tombstone iterator
+  // into the new file. The old range tombstone iterator is cleared.
+  InternalIterator* NewFileIterator() {
+    assert(file_index_ < flevel_->num_files);
+    auto file_meta = flevel_->files[file_index_];
+    if (should_sample_) {
+      sample_file_read_inc(file_meta.file_metadata);
+    }
+
+    const InternalKey* smallest_compaction_key = nullptr;
+    const InternalKey* largest_compaction_key = nullptr;
+    if (compaction_boundaries_ != nullptr) {
+      smallest_compaction_key =
+          (*compaction_boundaries_)[file_index_].smallest;
+      largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+    }
+    CheckMayBeOutOfLowerBound();
+    ClearRangeTombstoneIter();
+    return table_cache_->NewIterator(
+        read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+        range_del_agg_, prefix_extractor_,
+        nullptr /* don't need reference to table */, file_read_hist_, caller_,
+        /*arena=*/nullptr, skip_filters_, level_,
+        /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
+        largest_compaction_key, allow_unprepared_value_,
+        range_tombstone_iter_);
+  }
+
+  // Check if the current file is fully within iterate_lower_bound.
+  //
+  // Note MyRocks may update iterate bounds between seeks. To work around
+  // this, we need to check and update may_be_out_of_lower_bound_
+  // accordingly.
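+  // Hypothetical example (editorial): with iterate_lower_bound = "k5", a
+  // file whose smallest user key is "k3" may still surface keys below the
+  // bound, so may_be_out_of_lower_bound_ must stay true; a file starting at
+  // "k6" cannot, so the per-key lower-bound check can be skipped for it.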
+  void CheckMayBeOutOfLowerBound() {
+    if (read_options_.iterate_lower_bound != nullptr &&
+        file_index_ < flevel_->num_files) {
+      may_be_out_of_lower_bound_ =
+          user_comparator_.CompareWithoutTimestamp(
+              ExtractUserKey(file_smallest_key(file_index_)),
+              /*a_has_ts=*/true, *read_options_.iterate_lower_bound,
+              /*b_has_ts=*/false) < 0;
+    }
+  }
+
+  TableCache* table_cache_;
+  const ReadOptions& read_options_;
+  const FileOptions& file_options_;
+  const InternalKeyComparator& icomparator_;
+  const UserComparatorWrapper user_comparator_;
+  const LevelFilesBrief* flevel_;
+  mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether a prefix iterator
+  // is used.
+  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+
+  HistogramImpl* file_read_hist_;
+  bool should_sample_;
+  TableReaderCaller caller_;
+  bool skip_filters_;
+  bool allow_unprepared_value_;
+  bool may_be_out_of_lower_bound_ = true;
+  size_t file_index_;
+  int level_;
+  RangeDelAggregator* range_del_agg_;
+  IteratorWrapper file_iter_;  // May be nullptr
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // To be propagated to RangeDelAggregator in order to safely truncate range
+  // tombstones.
+  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+
+  bool is_next_read_sequential_;
+
+  // This is set when this level iterator is used under a merging iterator
+  // that processes range tombstones. range_tombstone_iter_ points to where
+  // the merging iterator stores the range tombstone iterator for this level.
+  // When this level iterator moves to a new SST file, it updates the range
+  // tombstones accordingly through this pointer. So the merging iterator
+  // always has access to the current SST file's range tombstones.
+  //
+  // The level iterator treats file boundaries as fake keys (sentinel keys)
+  // to keep range tombstones alive if needed and to make the upper level,
+  // i.e. the merging iterator, aware of file changes (when the level
+  // iterator moves to a new SST file, there is some bookkeeping work that
+  // needs to be done at the merging iterator end).
+  //
+  // *range_tombstone_iter_ points to range tombstones of the current SST
+  // file
+  TruncatedRangeDelIterator** range_tombstone_iter_;
+
+  // Whether the next/prev key is a sentinel key.
+  bool to_return_sentinel_ = false;
+  // The sentinel key to be returned
+  Slice sentinel_;
+  // Sets the flag for whether the sentinel key should be returned next.
+  // The condition for returning the sentinel is reaching the end of the
+  // current file_iter_: !Valid() && status().ok().
+  void TrySetDeleteRangeSentinel(const Slice& boundary_key);
+  void ClearSentinel() { to_return_sentinel_ = false; }
+
+  // Set in Seek() when a prefix seek reaches the end of the current file,
+  // and the next file has a different prefix. SkipEmptyFileForward()
+  // will not move to the next file when this flag is set.
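+  // Illustrative sketch (editorial, assuming a fixed 4-byte prefix
+  // extractor): Seek("abcd5") lands in a file with no remaining "abcd" keys;
+  // if the next file's smallest key is "abxy1", its prefix differs from
+  // "abcd", so the iterator is invalidated here instead of advancing past
+  // the prefix.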
+  bool prefix_exhausted_ = false;
+};
+
+void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
+  assert(range_tombstone_iter_);
+  if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
+      file_iter_.status().ok()) {
+    to_return_sentinel_ = true;
+    sentinel_ = boundary_key;
+  }
+}
+
+void LevelIterator::Seek(const Slice& target) {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  // Check whether the seek key falls within the current file
+  bool need_to_reseek = true;
+  if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+    const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+    if (icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.largest_key) <= 0 &&
+        icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.smallest_key) >= 0) {
+      need_to_reseek = false;
+      assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+             file_index_);
+    }
+  }
+  if (need_to_reseek) {
+    TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+    size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+    InitFileIterator(new_file_index);
+  }
+
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.Seek(target);
+    // Status::TryAgain indicates that an asynchronous request for retrieval
+    // of data blocks has been submitted. So it should return at this point
+    // and Seek should be called again to retrieve the requested block and
+    // execute the remaining code.
+    if (file_iter_.status() == Status::TryAgain()) {
+      return;
+    }
+    if (!file_iter_.Valid() && file_iter_.status().ok() &&
+        prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
+        !read_options_.auto_prefix_mode &&
+        file_index_ < flevel_->num_files - 1) {
+      size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
+      Slice target_user_key_without_ts =
+          ExtractUserKeyAndStripTimestamp(target, ts_sz);
+      Slice next_file_first_user_key_without_ts =
+          ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
+                                          ts_sz);
+      if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
+          (!prefix_extractor_->InDomain(
+               next_file_first_user_key_without_ts) ||
+           user_comparator_.CompareWithoutTimestamp(
+               prefix_extractor_->Transform(target_user_key_without_ts),
+               false,
+               prefix_extractor_->Transform(
+                   next_file_first_user_key_without_ts),
+               false) != 0)) {
+        // SkipEmptyFileForward() will not advance to the next file when this
+        // flag is set, for the reasons detailed below.
+        //
+        // The file we initially positioned to has no keys under the target
+        // prefix, and the next file's smallest key has a different prefix
+        // than target. When doing a prefix iterator seek, once keys for one
+        // prefix have been exhausted, it can jump to any key that is larger.
+        // Here we are enforcing a stricter contract than that, in order to
+        // make it easier for higher layers (merging and DB iterator) to
+        // reason about the correctness:
+        // 1. Within the prefix, the result should be accurate.
+        // 2. If keys for the prefix are exhausted, the iterator is either
+        //    positioned to the next key after the prefix, or made invalid.
+        // A side benefit is that this invalidates the iterator earlier so
+        // that the upper level merging iterator can merge fewer child
+        // iterators.
+        //
+        // The flag is cleared in Seek*() calls. There is no need to clear
+        // the flag in Prev() since Prev() will not be called when the flag
+        // is set, for reasons explained below. If range_tombstone_iter_ is
+        // nullptr, then there is no file boundary sentinel key.
+        // Since !file_iter_.Valid() from the if condition above, this level
+        // iterator is !Valid(), so Prev() will not be called. If
+        // range_tombstone_iter_ is not nullptr, there are two cases
+        // depending on whether this level iterator reaches the top of the
+        // heap in the merging iterator (the upper layer).
+        // If so, the merging iterator will see the sentinel key, call
+        // NextAndGetResult(), and the call to NextAndGetResult() will skip
+        // the sentinel key and make this level iterator invalid. If not,
+        // then it could be because the upper layer is done before any method
+        // of this level iterator is called or another Seek*() call is
+        // invoked. Either way, Prev() is never called before Seek*().
+        // The flag should not be cleared at the beginning of
+        // Next/NextAndGetResult() since it is used in SkipEmptyFileForward()
+        // called in Next/NextAndGetResult().
+        prefix_exhausted_ = true;
+      }
+    }
+
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+  // Seek beyond this level's smallest key
+  if (new_file_index == 0 &&
+      icomparator_.Compare(target, file_smallest_key(0)) < 0) {
+    SetFileIterator(nullptr);
+    ClearRangeTombstoneIter();
+    CheckMayBeOutOfLowerBound();
+    return;
+  }
+  if (new_file_index >= flevel_->num_files) {
+    new_file_index = flevel_->num_files - 1;
+  }
+
+  InitFileIterator(new_file_index);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekForPrev(target);
+    if (range_tombstone_iter_ &&
+        icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
+      // In the SeekForPrev() case, it is possible that the target is less
+      // than the file's lower boundary since the largest key is used to
+      // determine the file index (FindFile()). When the target is less than
+      // the file's lower boundary, the sentinel key should not be set, so
+      // that SeekForPrev() does not result in a key larger than the target.
+      // This is correct in that there is no need to keep the range
+      // tombstones in this file alive, as they only cover keys starting from
+      // the file's lower boundary, which is after `target`.
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+    SkipEmptyFileBackward();
+  }
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  InitFileIterator(0);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToFirst();
+    if (range_tombstone_iter_) {
+      // We do this in SeekToFirst() and SeekToLast() since
+      // we could have an empty file with only range tombstones.
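+      // (Illustrative, editorial: an SST produced solely by
+      // DeleteRange("a", "z") holds no point keys at all; without the
+      // sentinel, its tombstone would be dropped as soon as file_iter_
+      // turned out to be empty.)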
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  InitFileIterator(flevel_->num_files - 1);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToLast();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+  }
+  SkipEmptyFileBackward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() {
+  assert(Valid());
+  if (to_return_sentinel_) {
+    // file_iter_ is at EOF already when to_return_sentinel_
+    ClearSentinel();
+  } else {
+    file_iter_.Next();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+}
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+  assert(Valid());
+  // file_iter_ is at EOF already when to_return_sentinel_
+  bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
+  if (!is_valid) {
+    if (to_return_sentinel_) {
+      ClearSentinel();
+    } else if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+    is_next_read_sequential_ = true;
+    SkipEmptyFileForward();
+    is_next_read_sequential_ = false;
+    is_valid = Valid();
+    if (is_valid) {
+      // This could be set in TrySetDeleteRangeSentinel() or
+      // SkipEmptyFileForward() above.
+      if (to_return_sentinel_) {
+        result->key = sentinel_;
+        result->bound_check_result = IterBoundCheck::kUnknown;
+        result->value_prepared = true;
+      } else {
+        result->key = key();
+        result->bound_check_result = file_iter_.UpperBoundCheckResult();
+        // Ideally, we should return the real file_iter_.value_prepared, but
+        // the information is not here. It would cause an extra
+        // PrepareValue() for the first key of a file.
+        result->value_prepared = !allow_unprepared_value_;
+      }
+    }
+  }
+  return is_valid;
+}
+
+void LevelIterator::Prev() {
+  assert(Valid());
+  if (to_return_sentinel_) {
+    ClearSentinel();
+  } else {
+    file_iter_.Prev();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+  }
+  SkipEmptyFileBackward();
+}
+
+bool LevelIterator::SkipEmptyFileForward() {
+  bool seen_empty_file = false;
+  // Pause at sentinel key
+  while (!to_return_sentinel_ &&
+         (file_iter_.iter() == nullptr ||
+          (!file_iter_.Valid() && file_iter_.status().ok() &&
+           file_iter_.iter()->UpperBoundCheckResult() !=
+               IterBoundCheck::kOutOfBound))) {
+    seen_empty_file = true;
+    // Move to next file
+    if (file_index_ >= flevel_->num_files - 1 ||
+        KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
+        prefix_exhausted_) {
+      SetFileIterator(nullptr);
+      ClearRangeTombstoneIter();
+      break;
+    }
+    // may init a new *range_tombstone_iter
+    InitFileIterator(file_index_ + 1);
+    // We moved to a new SST file.
+    // Seek range_tombstone_iter_ to reset its !Valid() default state.
+    // We do not need to call range_tombstone_iter_.Seek* in
+    // LevelIterator::Seek* since when the merging iterator calls
+    // LevelIterator::Seek*, it should also call Seek* into the corresponding
+    // range tombstone iterator.
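+    // Here, by contrast, the file changed without a user-visible Seek*(),
+    // so the freshly created *range_tombstone_iter_ would otherwise remain
+    // in its default !Valid() state and its tombstones would never be
+    // surfaced (editorial note restating the source comment above).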
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToFirst();
+      if (range_tombstone_iter_) {
+        if (*range_tombstone_iter_) {
+          (*range_tombstone_iter_)->SeekToFirst();
+        }
+        TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+      }
+    }
+  }
+  return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+  // Pause at sentinel key
+  while (!to_return_sentinel_ &&
+         (file_iter_.iter() == nullptr ||
+          (!file_iter_.Valid() && file_iter_.status().ok()))) {
+    // Move to previous file
+    if (file_index_ == 0) {
+      // Already the first file
+      SetFileIterator(nullptr);
+      ClearRangeTombstoneIter();
+      return;
+    }
+    InitFileIterator(file_index_ - 1);
+    // We moved to a new SST file.
+    // Seek range_tombstone_iter_ to reset its !Valid() default state.
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToLast();
+      if (range_tombstone_iter_) {
+        if (*range_tombstone_iter_) {
+          (*range_tombstone_iter_)->SeekToLast();
+        }
+        TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+        if (to_return_sentinel_) {
+          break;
+        }
+      }
+    }
+  }
+}
+
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+  if (pinned_iters_mgr_ && iter) {
+    iter->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  InternalIterator* old_iter = file_iter_.Set(iter);
+
+  // Update the read pattern for PrefetchBuffer.
+  if (is_next_read_sequential_) {
+    file_iter_.UpdateReadaheadState(old_iter);
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(old_iter);
+  } else {
+    delete old_iter;
+  }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+  if (new_file_index >= flevel_->num_files) {
+    file_index_ = new_file_index;
+    SetFileIterator(nullptr);
+    ClearRangeTombstoneIter();
+    return;
+  } else {
+    // If the file iterator returns Incomplete, we try it again if users seek
+    // to the same file, as this time we may go to a different data block
+    // which is cached in the block cache.
+    if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+        new_file_index == file_index_) {
+      // file_iter_ is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      file_index_ = new_file_index;
+      InternalIterator* iter = NewFileIterator();
+      SetFileIterator(iter);
+    }
+  }
+}
+}  // anonymous namespace
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                                   const FileMetaData* file_meta,
+                                   const std::string* fname) const {
+  auto table_cache = cfd_->table_cache();
+  auto ioptions = cfd_->ioptions();
+  Status s = table_cache->GetTableProperties(
+      file_options_, cfd_->internal_comparator(), *file_meta, tp,
+      mutable_cf_options_.prefix_extractor, true /* no io */);
+  if (s.ok()) {
+    return s;
+  }
+
+  // We only ignore error type `Incomplete` since it's by design that we
+  // disallow reading the table when it's not present in the table cache.
+  if (!s.IsIncomplete()) {
+    return s;
+  }
+
+  // 2. Table is not present in the table cache, so we'll read the table
+  // properties directly from the properties block in the file.
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name;
+  if (fname != nullptr) {
+    file_name = *fname;
+  } else {
+    file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+                              file_meta->fd.GetPathId());
+  }
+  s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+                                        nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // By setting the magic number to kNullTableMagicNumber, we can bypass
+  // the magic number check in the footer.
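+  // (Editorial note: with kNullTableMagicNumber, only the magic-number
+  // comparison is skipped; the footer is still parsed to locate the
+  // properties block, so this path should work regardless of the table
+  // format that produced the file.)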
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, nullptr /* env */, io_tracer_,
+          nullptr /* stats */, 0 /* hist_type */,
+          nullptr /* file_read_hist */, nullptr /* rate_limiter */,
+          ioptions->listeners));
+  std::unique_ptr<TableProperties> props;
+  s = ReadTableProperties(
+      file_reader.get(), file_meta->fd.GetFileSize(),
+      Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
+      &props);
+  if (!s.ok()) {
+    return s;
+  }
+  *tp = std::move(props);
+  RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+  return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  Status s;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    s = GetPropertiesOfAllTables(props, level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+                                            std::string* out_str) {
+  if (max_entries_to_print <= 0) {
+    return Status::OK();
+  }
+  int num_entries_left = max_entries_to_print;
+
+  std::stringstream ss;
+
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.files_[level]) {
+      auto fname =
+          TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                        file_meta->fd.GetPathId());
+
+      ss << "=== file : " << fname << " ===\n";
+
+      TableCache* table_cache = cfd_->table_cache();
+      std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+      Status s = table_cache->GetRangeTombstoneIterator(
+          ReadOptions(), cfd_->internal_comparator(), *file_meta,
+          &tombstone_iter);
+      if (!s.ok()) {
+        return s;
+      }
+      if (tombstone_iter) {
+        tombstone_iter->SeekToFirst();
+
+        // TODO: print timestamp
+        while (tombstone_iter->Valid() && num_entries_left > 0) {
+          ss << "start: " << tombstone_iter->start_key().ToString(true)
+             << " end: " << tombstone_iter->end_key().ToString(true)
+             << " seq: " << tombstone_iter->seq() << '\n';
+          tombstone_iter->Next();
+          num_entries_left--;
+        }
+        if (num_entries_left <= 0) {
+          break;
+        }
+      }
+    }
+    if (num_entries_left <= 0) {
+      break;
+    }
+  }
+  assert(num_entries_left >= 0);
+  if (num_entries_left <= 0) {
+    ss << "(results may not be complete)\n";
+  }
+
+  *out_str = ss.str();
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+                                         int level) {
+  for (const auto& file_meta : storage_info_.files_[level]) {
+    auto fname =
+        TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+    // 1. If the table is already present in table cache, load table
+    // properties from there.
+    std::shared_ptr<const TableProperties> table_properties;
+    Status s = GetTableProperties(&table_properties, file_meta, &fname);
+    if (s.ok()) {
+      props->insert({fname, table_properties});
+    } else {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+    const Range* range, std::size_t n,
+    TablePropertiesCollection* props) const {
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    for (decltype(n) i = 0; i < n; i++) {
+      // Convert user_key into a corresponding internal key.
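+      // (Editorial sketch: an internal key is (user_key, sequence, type);
+      // pairing kMaxSequenceNumber with kValueTypeForSeek makes k1/k2 sort
+      // before every real entry of the same user key, so the internal-key
+      // range below effectively spans the user-key range [start, limit]
+      // for the overlap check.)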
+      InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+      InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+      std::vector<FileMetaData*> files;
+      storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+                                         false);
+      for (const auto& file_meta : files) {
+        auto fname =
+            TableFileName(cfd_->ioptions()->cf_paths,
+                          file_meta->fd.GetNumber(),
+                          file_meta->fd.GetPathId());
+        if (props->count(fname) == 0) {
+          // 1. If the table is already present in table cache, load table
+          // properties from there.
+          std::shared_ptr<const TableProperties> table_properties;
+          Status s = GetTableProperties(&table_properties, file_meta, &fname);
+          if (s.ok()) {
+            props->insert({fname, table_properties});
+          } else {
+            return s;
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+    std::shared_ptr<const TableProperties>* tp, int level) {
+  TablePropertiesCollection props;
+  Status s;
+  if (level < 0) {
+    s = GetPropertiesOfAllTables(&props);
+  } else {
+    s = GetPropertiesOfAllTables(&props, level);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* new_tp = new TableProperties();
+  for (const auto& item : props) {
+    new_tp->Add(*item.second);
+  }
+  tp->reset(new_tp);
+  return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+  size_t total_usage = 0;
+  for (auto& file_level : storage_info_.level_files_brief_) {
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+          file_options_, cfd_->internal_comparator(),
+          *file_level.files[i].file_metadata,
+          mutable_cf_options_.prefix_extractor);
+    }
+  }
+  return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+
+  cf_meta->blob_file_size = 0;
+  cf_meta->blob_file_count = 0;
+  cf_meta->blob_files.clear();
+
+  auto* ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    uint64_t level_size = 0;
+    cf_meta->file_count += vstorage->LevelFiles(level).size();
+    std::vector<SstFileMetaData> files;
+    for (const auto& file : vstorage->LevelFiles(level)) {
+      uint32_t path_id = file->fd.GetPathId();
+      std::string file_path;
+      if (path_id < ioptions->cf_paths.size()) {
+        file_path = ioptions->cf_paths[path_id].path;
+      } else {
+        assert(!ioptions->cf_paths.empty());
+        file_path = ioptions->cf_paths.back().path;
+      }
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(
+          MakeTableFileName("", file_number), file_number, file_path,
+          file->fd.GetFileSize(), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->temperature,
+          file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+          file->TryGetFileCreationTime(), file->file_checksum,
+          file->file_checksum_func_name);
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      level_size += file->fd.GetFileSize();
+    }
+    cf_meta->levels.emplace_back(level, level_size, std::move(files));
+    cf_meta->size += level_size;
+  }
+  for (const auto& meta : vstorage->GetBlobFiles()) {
+    assert(meta);
+
+    cf_meta->blob_files.emplace_back(
+        meta->GetBlobFileNumber(),
+        BlobFileName("", meta->GetBlobFileNumber()),
+        ioptions->cf_paths.front().path,
+        meta->GetBlobFileSize(),
+        meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
+        meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
+        meta->GetChecksumMethod(), meta->GetChecksumValue());
+    ++cf_meta->blob_file_count;
+    cf_meta->blob_file_size += meta->GetBlobFileSize();
+  }
+}
+
+uint64_t Version::GetSstFilesSize() {
+  uint64_t sst_files_size = 0;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+      sst_files_size += file_meta->fd.GetFileSize();
+    }
+  }
+  return sst_files_size;
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+  for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+    for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+      assert(meta->fd.table_reader != nullptr);
+      uint64_t file_creation_time = meta->TryGetFileCreationTime();
+      if (file_creation_time == kUnknownFileCreationTime) {
+        *creation_time = 0;
+        return;
+      }
+      if (file_creation_time < oldest_time) {
+        oldest_time = file_creation_time;
+      }
+    }
+  }
+  *creation_time = oldest_time;
+}
+
+InternalIterator* Version::TEST_GetLevelIterator(
+    const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+    int level, bool allow_unprepared_value) {
+  auto* arena = merge_iter_builder->GetArena();
+  auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+  TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+  auto level_iter = new (mem) LevelIterator(
+      cfd_->table_cache(), read_options, file_options_,
+      cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+      mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+      cfd_->internal_stats()->GetFileReadHist(level),
+      TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+      nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
+      allow_unprepared_value, &tombstone_iter_ptr);
+  if (read_options.ignore_range_deletions) {
+    merge_iter_builder->AddIterator(level_iter);
+  } else {
+    merge_iter_builder->AddPointAndTombstoneIterator(
+        level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+  }
+  return level_iter;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // Estimation will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) deletion on non-existing keys
+  // (4) low number of samples
+  if (current_num_samples_ == 0) {
+    return 0;
+  }
+
+  if (current_num_non_deletions_ <= current_num_deletions_) {
+    return 0;
+  }
+
+  uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+  uint64_t file_count = 0;
+  for (int level = 0; level < num_levels_; ++level) {
+    file_count += files_[level].size();
+  }
+
+  if (current_num_samples_ < file_count) {
+    // casting to avoid overflowing
+    return static_cast<uint64_t>(
+        (est * static_cast<double>(file_count) / current_num_samples_));
+  } else {
+    return est;
+  }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+    int level) const {
+  assert(level < num_levels_);
+  uint64_t sum_file_size_bytes = 0;
+  uint64_t sum_data_size_bytes = 0;
+  for (auto* file_meta : files_[level]) {
+    sum_file_size_bytes += file_meta->fd.GetFileSize();
+    sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+  }
+  if (sum_file_size_bytes == 0) {
+    return -1.0;
+  }
+  return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+                           const FileOptions& soptions,
+                           MergeIteratorBuilder* merge_iter_builder,
+                           bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+                         allow_unprepared_value);
+  }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+                                   const FileOptions& soptions,
+                                   MergeIteratorBuilder* merge_iter_builder,
+                                   int level, bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+  if (level >= storage_info_.num_non_empty_levels()) {
+    // This is an empty level
+    return;
+  } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+    // No files in this level
+    return;
+  }
+
+  bool should_sample = should_sample_file_read();
+
+  auto* arena = merge_iter_builder->GetArena();
+  if (level == 0) {
+    // Merge all level zero files together since they may overlap
+    TruncatedRangeDelIterator* tombstone_iter = nullptr;
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+      auto table_iter = cfd_->table_cache()->NewIterator(
+          read_options, soptions, cfd_->internal_comparator(),
+          *file.file_metadata, /*range_del_agg=*/nullptr,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr, allow_unprepared_value,
+          &tombstone_iter);
+      if (read_options.ignore_range_deletions) {
+        merge_iter_builder->AddIterator(table_iter);
+      } else {
+        merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
+                                                         tombstone_iter);
+      }
+    }
+    if (should_sample) {
+      // Count one read sample for each L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per seek. If users execute one range query per iterator,
+      // there may be some discrepancy here.
+      for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+        sample_file_read_inc(meta);
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    // For levels > 0, we can use a concatenating iterator that sequentially
+    // walks through the non-overlapping files in the level, opening them
+    // lazily.
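+    // Rough picture (editorial illustration): L1+ files are disjoint and
+    // sorted by key range, e.g.
+    //   f1[a..c]  f2[d..f]  f3[g..i]
+    // so one LevelIterator can binary-search the file list on each Seek()
+    // and keep at most a single table iterator open at a time.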
+    auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+    TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+    auto level_iter = new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, soptions,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
+        allow_unprepared_value, &tombstone_iter_ptr);
+    if (read_options.ignore_range_deletions) {
+      merge_iter_builder->AddIterator(level_iter);
+    } else {
+      merge_iter_builder->AddPointAndTombstoneIterator(
+          level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+    }
+  }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+                                         const FileOptions& file_options,
+                                         const Slice& smallest_user_key,
+                                         const Slice& largest_user_key,
+                                         int level, bool* overlap) {
+  assert(storage_info_.finalized_);
+
+  auto icmp = cfd_->internal_comparator();
+  auto ucmp = icmp.user_comparator();
+
+  Arena arena;
+  Status status;
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  *overlap = false;
+
+  if (level == 0) {
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+      if (AfterFile(ucmp, &smallest_user_key, file) ||
+          BeforeFile(ucmp, &largest_user_key, file)) {
+        continue;
+      }
+      ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+          read_options, file_options, cfd_->internal_comparator(),
+          *file->file_metadata, &range_del_agg,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, &arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false));
+      status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                   iter.get(), overlap);
+      if (!status.ok() || *overlap) {
+        break;
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+    ScopedArenaIterator iter(new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, file_options,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        &range_del_agg));
+    status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                 iter.get(), overlap);
+  }
+
+  if (status.ok() && *overlap == false &&
+      range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+    *overlap = true;
+  }
+  return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+    const InternalKeyComparator* internal_comparator,
+    const Comparator* user_comparator, int levels,
+    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+    bool _force_consistency_checks)
+    : internal_comparator_(internal_comparator),
+      user_comparator_(user_comparator),
+      // cfd is nullptr if Version is dummy
+      num_levels_(levels),
+      num_non_empty_levels_(0),
+      file_indexer_(user_comparator),
+      compaction_style_(compaction_style),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      base_level_(num_levels_ == 1 ? -1 : 1),
-1 : 1), + level_multiplier_(0.0), + files_by_compaction_pri_(num_levels_), + level0_non_overlapping_(false), + next_file_to_compact_by_size_(num_levels_), + compaction_score_(num_levels_), + compaction_level_(num_levels_), + l0_delay_trigger_count_(0), + compact_cursor_(num_levels_), + accumulated_file_size_(0), + accumulated_raw_key_size_(0), + accumulated_raw_value_size_(0), + accumulated_num_non_deletions_(0), + accumulated_num_deletions_(0), + current_num_non_deletions_(0), + current_num_deletions_(0), + current_num_samples_(0), + estimated_compaction_needed_bytes_(0), + finalized_(false), + force_consistency_checks_(_force_consistency_checks) { + if (ref_vstorage != nullptr) { + accumulated_file_size_ = ref_vstorage->accumulated_file_size_; + accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; + accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_; + accumulated_num_non_deletions_ = + ref_vstorage->accumulated_num_non_deletions_; + accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_; + current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_; + current_num_deletions_ = ref_vstorage->current_num_deletions_; + current_num_samples_ = ref_vstorage->current_num_samples_; + oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_; + compact_cursor_ = ref_vstorage->compact_cursor_; + compact_cursor_.resize(num_levels_); + } +} + +Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, + const FileOptions& file_opt, + const MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, + uint64_t version_number) + : env_(vset->env_), + clock_(vset->clock_), + cfd_(column_family_data), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger), + db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats), + table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), + blob_source_(cfd_ ? cfd_->blob_source() : nullptr), + merge_operator_( + (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()), + storage_info_( + (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), + (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), + cfd_ == nullptr ? 0 : cfd_->NumberLevels(), + cfd_ == nullptr ? kCompactionStyleLevel + : cfd_->ioptions()->compaction_style, + (cfd_ == nullptr || cfd_->current() == nullptr) + ? nullptr + : cfd_->current()->storage_info(), + cfd_ == nullptr ? 
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      file_options_(file_opt),
+      mutable_cf_options_(mutable_cf_options),
+      max_file_size_for_l0_meta_pin_(
+          MaxFileSizeForL0MetaPin(mutable_cf_options_)),
+      version_number_(version_number),
+      io_tracer_(io_tracer) {}
+
+Status Version::GetBlob(const ReadOptions& read_options,
+                        const Slice& user_key, const Slice& blob_index_slice,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  BlobIndex blob_index;
+
+  {
+    Status s = blob_index.DecodeFrom(blob_index_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
+                 bytes_read);
+}
+
+Status Version::GetBlob(const ReadOptions& read_options,
+                        const Slice& user_key, const BlobIndex& blob_index,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  assert(value);
+
+  if (blob_index.HasTTL() || blob_index.IsInlined()) {
+    return Status::Corruption("Unexpected TTL/inlined blob index");
+  }
+
+  const uint64_t blob_file_number = blob_index.file_number();
+
+  auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
+  if (!blob_file_meta) {
+    return Status::Corruption("Invalid blob file number");
+  }
+
+  assert(blob_source_);
+  value->Reset();
+  const Status s = blob_source_->GetBlob(
+      read_options, user_key, blob_file_number, blob_index.offset(),
+      blob_file_meta->GetBlobFileSize(), blob_index.size(),
+      blob_index.compression(), prefetch_buffer, value, bytes_read);
+
+  return s;
+}
+
+void Version::MultiGetBlob(
+    const ReadOptions& read_options, MultiGetRange& range,
+    std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+  assert(!blob_ctxs.empty());
+
+  autovector<BlobFileReadRequests> blob_reqs;
+
+  for (auto& ctx : blob_ctxs) {
+    const auto file_number = ctx.first;
+    const auto blob_file_meta =
+        storage_info_.GetBlobFileMetaData(file_number);
+
+    autovector<BlobReadRequest> blob_reqs_in_file;
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (const auto& blob : blobs_in_file) {
+      const BlobIndex& blob_index = blob.first;
+      const KeyContext& key_context = blob.second;
+
+      if (!blob_file_meta) {
+        *key_context.s = Status::Corruption("Invalid blob file number");
+        continue;
+      }
+
+      if (blob_index.HasTTL() || blob_index.IsInlined()) {
+        *key_context.s =
+            Status::Corruption("Unexpected TTL/inlined blob index");
+        continue;
+      }
+
+      key_context.value->Reset();
+      blob_reqs_in_file.emplace_back(
+          key_context.ukey_with_ts, blob_index.offset(), blob_index.size(),
+          blob_index.compression(), key_context.value, key_context.s);
+    }
+    if (blob_reqs_in_file.size() > 0) {
+      const auto file_size = blob_file_meta->GetBlobFileSize();
+      blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+    }
+  }
+
+  if (blob_reqs.size() > 0) {
+    blob_source_->MultiGetBlob(read_options, blob_reqs,
+                               /*bytes_read=*/nullptr);
+  }
+
+  for (auto& ctx : blob_ctxs) {
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (const auto& blob : blobs_in_file) {
+      const KeyContext& key_context = blob.second;
+      if (key_context.s->ok()) {
+        range.AddValueSize(key_context.value->size());
+        if (range.GetValueSize() > read_options.value_size_soft_limit) {
+          *key_context.s = Status::Aborted();
+        }
+      } else if (key_context.s->IsIncomplete()) {
+        // read_options.read_tier == kBlockCacheTier
+        // Cannot read blob(s): no disk I/O allowed
+        assert(key_context.get_context);
+        auto& get_context = *(key_context.get_context);
+        get_context.MarkKeyMayExist();
+      }
+    }
+  }
+}
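+
+// Sketch of the point-lookup path below (editorial note, not upstream
+// text): Get() visits candidate files newest-to-oldest via FilePicker, e.g.
+//   L0: every overlapping file, most recent first
+//   L1+: at most one file per level, found by binary search
+// and stops at the first file whose GetContext reaches a final state
+// (kFound/kDeleted/kCorrupt), or once a covering range tombstone is seen.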
+void Version::Get(const ReadOptions& read_options, const LookupKey& k, + PinnableSlice* value, PinnableWideColumns* columns, + std::string* timestamp, Status* status, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, + bool* key_exists, SequenceNumber* seq, ReadCallback* callback, + bool* is_blob, bool do_merge) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + + assert(status->ok() || status->IsMergeInProgress()); + + if (key_exists != nullptr) { + // will falsify below if not found + *key_exists = true; + } + + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } + + // Note: the old StackableDB-based BlobDB passes in + // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we + // need to provide it here. + bool is_blob_index = false; + bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; + BlobFetcher blob_fetcher(this, read_options); + + assert(pinned_iters_mgr); + GetContext get_context( + user_comparator(), merge_operator_, info_log_, db_statistics_, + status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, + do_merge ? value : nullptr, do_merge ? columns : nullptr, + do_merge ? timestamp : nullptr, value_found, merge_context, do_merge, + max_covering_tombstone_seq, clock_, seq, + merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use, + tracing_get_id, &blob_fetcher); + + // Pin blocks that we read to hold merge operands + if (merge_operator_) { + pinned_iters_mgr->StartPinning(); + } + + FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, + storage_info_.num_non_empty_levels_, + &storage_info_.file_indexer_, user_comparator(), + internal_comparator()); + FdWithKeyRange* f = fp.GetNextFile(); + + while (f != nullptr) { + if (*max_covering_tombstone_seq > 0) { + // The remaining files we look at will only contain covered keys, so we + // stop here. 
+      break;
+    }
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+
+    bool timer_enabled =
+        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+        get_perf_context()->per_level_perf_context_enabled;
+    StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+    *status = table_cache_->Get(
+        read_options, *internal_comparator(), *f->file_metadata, ikey,
+        &get_context, mutable_cf_options_.prefix_extractor,
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel()),
+        fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
+    // TODO: examine the behavior for corrupted key
+    if (timer_enabled) {
+      PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                                fp.GetHitFileLevel());
+    }
+    if (!status->ok()) {
+      if (db_statistics_ != nullptr) {
+        get_context.ReportCounters();
+      }
+      return;
+    }
+
+    // report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (fp.GetHitFileLevel() == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (fp.GetHitFileLevel() == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (fp.GetHitFileLevel() >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+                                  fp.GetHitFileLevel());
+
+        if (is_blob_index) {
+          if (do_merge && value) {
+            TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
+                                     value);
+
+            constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+            constexpr uint64_t* bytes_read = nullptr;
+
+            *status = GetBlob(read_options, user_key, *value, prefetch_buffer,
+                              value, bytes_read);
+            if (!status->ok()) {
+              if (status->IsIncomplete()) {
+                get_context.MarkKeyMayExist();
+              }
+              return;
+            }
+          }
+        }
+
+        return;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        return;
+      case GetContext::kCorrupt:
+        *status = Status::Corruption("corrupted key for ", user_key);
+        return;
+      case GetContext::kUnexpectedBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        return;
+    }
+    f = fp.GetNextFile();
+  }
+  if (db_statistics_ != nullptr) {
+    get_context.ReportCounters();
+  }
+  if (GetContext::kMerge == get_context.State()) {
+    if (!do_merge) {
+      *status = Status::OK();
+      return;
+    }
+    if (!merge_operator_) {
+      *status = Status::InvalidArgument(
+          "merge_operator is not properly initialized.");
+      return;
+    }
+    // merge_operands are in saver and we hit the beginning of the key
+    // history; do a final merge of nullptr and operands
+    if (value || columns) {
+      std::string result;
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+          &result, info_log_, db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+      if (status->ok()) {
+        if (LIKELY(value != nullptr)) {
+          *(value->GetSelf()) = std::move(result);
+          value->PinSelf();
+        } else {
+          assert(columns != nullptr);
+          columns->SetPlainValue(result);
+        }
+      }
+    }
+  } else {
+    if (key_exists != nullptr) {
+      *key_exists = false;
+    }
+    *status = Status::NotFound();  // Use an empty error message for speed
+  }
+}
+
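+// Illustrative note (not upstream code): when a key's history consists only
+// of merge operands (no Put or Delete was found in any file), the result is
+// equivalent to FullMerge(key, /*existing_value=*/nullptr, operands). For a
+// hypothetical append-style merge operator, operands "a" and "b" would still
+// produce a combined value even though no base value exists for the key.
+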
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                       ReadCallback* callback) {
+  PinnedIteratorsManager pinned_iters_mgr;
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr.StartPinning();
+  }
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+  }
+  // Even though we know the batch size won't be > MAX_BATCH_SIZE, use
+  // autovector in order to avoid unnecessary construction of GetContext
+  // objects, which is expensive
+  autovector<GetContext, 16> get_ctx;
+  BlobFetcher blob_fetcher(this, read_options);
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    assert(iter->s->ok() || iter->s->IsMergeInProgress());
+    get_ctx.emplace_back(
+        user_comparator(), merge_operator_, info_log_, db_statistics_,
+        iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
+        iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp,
+        nullptr, &(iter->merge_context), true,
+        &iter->max_covering_tombstone_seq, clock_, nullptr,
+        merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
+        &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
+    // MergeInProgress status, if set, has been transferred to the get_context
+    // state, so we set status to ok here. From now on, the iter status will
+    // be used for IO errors, and get_context state will be used for any
+    // key level errors
+    *(iter->s) = Status::OK();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
+  }
+
+  Status s;
+  // blob_file => [[blob_idx, it], ...]
+  std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
+  MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
+#if USE_COROUTINES
+  if (read_options.async_io && read_options.optimize_multiget_for_io &&
+      using_coroutines()) {
+    s = MultiGetAsync(read_options, range, &blob_ctxs);
+  } else
+#endif  // USE_COROUTINES
+  {
+    MultiGetRange file_picker_range(*range, range->begin(), range->end());
+    FilePickerMultiGet fp(&file_picker_range,
+                          &storage_info_.level_files_brief_,
+                          storage_info_.num_non_empty_levels_,
+                          &storage_info_.file_indexer_, user_comparator(),
+                          internal_comparator());
+    FdWithKeyRange* f = fp.GetNextFileInLevel();
+    uint64_t num_index_read = 0;
+    uint64_t num_filter_read = 0;
+    uint64_t num_sst_read = 0;
+    uint64_t num_level_read = 0;
+
+    int prev_level = -1;
+
+    while (!fp.IsSearchEnded()) {
+      // This will be set to true later if we actually look up in a file in L0.
+      // For per level stats purposes, an L0 file is treated as a level
+      bool dump_stats_for_l0_file = false;
+
+      // Avoid using the coroutine version if we're looking in a L0 file, since
+      // L0 files won't be parallelized anyway. The regular synchronous version
+      // is faster.
+      if (!read_options.async_io || !using_coroutines() ||
+          fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
+        if (f) {
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          // Call MultiGetFromSST for looking up a single file
+          s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
+                              fp.GetHitFileLevel(), skip_filters,
+                              /*skip_range_deletions=*/false, f, blob_ctxs,
+                              /*table_handle=*/nullptr, num_filter_read,
+                              num_index_read, num_sst_read);
+          if (fp.GetHitFileLevel() == 0) {
+            dump_stats_for_l0_file = true;
+          }
+        }
+        if (s.ok()) {
+          f = fp.GetNextFileInLevel();
+        }
+#if USE_COROUTINES
+      } else {
+        std::vector<folly::coro::Task<Status>> mget_tasks;
+        while (f != nullptr) {
+          MultiGetRange file_range = fp.CurrentFileRange();
+          Cache::Handle* table_handle = nullptr;
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          bool skip_range_deletions = false;
+          if (!skip_filters) {
+            Status status = table_cache_->MultiGetFilter(
+                read_options, *internal_comparator(), *f->file_metadata,
+                mutable_cf_options_.prefix_extractor,
+                cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+                fp.GetHitFileLevel(), &file_range, &table_handle);
+            skip_range_deletions = true;
+            if (status.ok()) {
+              skip_filters = true;
+            } else if (!status.IsNotSupported()) {
+              s = status;
+            }
+          }
+
+          if (!s.ok()) {
+            break;
+          }
+
+          if (!file_range.empty()) {
+            mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+                read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+                skip_range_deletions, f, blob_ctxs, table_handle,
+                num_filter_read, num_index_read, num_sst_read));
+          }
+          if (fp.KeyMaySpanNextFile()) {
+            break;
+          }
+          f = fp.GetNextFileInLevel();
+        }
+        if (mget_tasks.size() > 0) {
+          RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+                     mget_tasks.size());
+          // Collect all results so far
+          std::vector<Status> statuses = folly::coro::blockingWait(
+              folly::coro::collectAllRange(std::move(mget_tasks))
+                  .scheduleOn(&range->context()->executor()));
+          if (s.ok()) {
+            for (Status stat : statuses) {
+              if (!stat.ok()) {
+                s = std::move(stat);
+                break;
+              }
+            }
+          }
+
+          if (s.ok() && fp.KeyMaySpanNextFile()) {
+            f = fp.GetNextFileInLevel();
+          }
+        }
+#endif  // USE_COROUTINES
+      }
+      // If bad status or we found final result for all the keys
+      if (!s.ok() || file_picker_range.empty()) {
+        break;
+      }
+      if (!f) {
+        // Reached the end of this level. Prepare the next level
+        fp.PrepareNextLevelForSearch();
+        if (!fp.IsSearchEnded()) {
+          // It's possible there is no overlap on this level and f is nullptr
+          f = fp.GetNextFileInLevel();
+        }
+        if (dump_stats_for_l0_file ||
+            (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
+          // Dump the stats if the search has moved to the next level and
+          // reset for next level.
+          if (num_filter_read + num_index_read) {
+            RecordInHistogram(db_statistics_,
+                              NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                              num_index_read + num_filter_read);
+          }
+          if (num_sst_read) {
+            RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+                              num_sst_read);
+            num_level_read++;
+          }
+          num_filter_read = 0;
+          num_index_read = 0;
+          num_sst_read = 0;
+        }
+        prev_level = fp.GetHitFileLevel();
+      }
+    }
+
+    // Dump stats for most recent level
+    if (num_filter_read + num_index_read) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_index_read + num_filter_read);
+    }
+    if (num_sst_read) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+      num_level_read++;
+    }
+    if (num_level_read) {
+      RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+                        num_level_read);
+    }
+  }
+
+  if (s.ok() && !blob_ctxs.empty()) {
+    MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
+  }
+
+  // Process any left over keys
+  for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    Slice user_key = iter->lkey->user_key();
+
+    if (db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    if (GetContext::kMerge == get_context.State()) {
+      if (!merge_operator_) {
+        *status = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        range->MarkKeyDone(iter);
+        continue;
+      }
+      // merge_operands are in saver and we hit the beginning of the key
+      // history; do a final merge of nullptr and operands
+      std::string* str_value =
+          iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr,
+          iter->merge_context.GetOperands(), str_value, info_log_,
+          db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+      if (LIKELY(iter->value != nullptr)) {
+        iter->value->PinSelf();
+        range->AddValueSize(iter->value->size());
+        range->MarkKeyDone(iter);
+        if (range->GetValueSize() > read_options.value_size_soft_limit) {
+          s = Status::Aborted();
+          break;
+        }
+      }
+    } else {
+      range->MarkKeyDone(iter);
+      *status = Status::NotFound();  // Use an empty error message for speed
+    }
+  }
+
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    range->MarkKeyDone(iter);
+    *(iter->s) = s;
+  }
+}
+
+#ifdef USE_COROUTINES
+Status Version::ProcessBatch(
+    const ReadOptions& read_options, FilePickerMultiGet* batch,
+    std::vector<folly::coro::Task<Status>>& mget_tasks,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+    autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+    std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+    std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+        mget_stats) {
+  FilePickerMultiGet& fp = *batch;
+  MultiGetRange range = fp.GetRange();
+  // Initialize a new empty range. Any keys that are not in this level will
+  // eventually become part of the new range.
+  MultiGetRange leftover(range, range.begin(), range.begin());
+  FdWithKeyRange* f = nullptr;
+  Status s;
+
+  f = fp.GetNextFileInLevel();
+  while (!f) {
+    fp.PrepareNextLevelForSearch();
+    if (!fp.IsSearchEnded()) {
+      f = fp.GetNextFileInLevel();
+    } else {
+      break;
+    }
+  }
+  while (f) {
+    MultiGetRange file_range = fp.CurrentFileRange();
+    Cache::Handle* table_handle = nullptr;
+    bool skip_filters =
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel());
+    bool skip_range_deletions = false;
+    if (!skip_filters) {
+      Status status = table_cache_->MultiGetFilter(
+          read_options, *internal_comparator(), *f->file_metadata,
+          mutable_cf_options_.prefix_extractor,
+          cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+          fp.GetHitFileLevel(), &file_range, &table_handle);
+      if (status.ok()) {
+        skip_filters = true;
+        skip_range_deletions = true;
+      } else if (!status.IsNotSupported()) {
+        s = status;
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+    // At this point, file_range contains any keys that are likely in this
+    // file. It may have false positives, but that's ok since higher level
+    // lookups for the key are dependent on this lookup anyway.
+    // Add the complement of file_range to leftover. That's the set of keys
+    // definitely not in this level.
+    // Subtract the complement of file_range from range, since they will be
+    // processed in a separate batch in parallel.
+    leftover += ~file_range;
+    range -= ~file_range;
+    if (!file_range.empty()) {
+      int level = fp.GetHitFileLevel();
+      auto stat = mget_stats.find(level);
+      if (stat == mget_stats.end()) {
+        auto entry = mget_stats.insert({level, {0, 0, 0}});
+        assert(entry.second);
+        stat = entry.first;
+      }
+
+      if (waiting.empty() && to_process.empty() &&
+          !fp.RemainingOverlapInLevel() && leftover.empty() &&
+          mget_tasks.empty()) {
+        // All keys are in one SST file, so take the fast path
+        s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
+                            skip_filters, skip_range_deletions, f, *blob_ctxs,
+                            table_handle, std::get<0>(stat->second),
+                            std::get<1>(stat->second),
+                            std::get<2>(stat->second));
+      } else {
+        mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+            read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+            skip_range_deletions, f, *blob_ctxs, table_handle,
+            std::get<0>(stat->second), std::get<1>(stat->second),
+            std::get<2>(stat->second)));
+        ++num_tasks_queued;
+      }
+    }
+    if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
+      break;
+    }
+    f = fp.GetNextFileInLevel();
+  }
+  // Split the current batch only if some keys are likely in this level and
+  // some are not. Only split if we're done with this level, i.e. f is null.
+  // Otherwise, it means there are more files in this level to look at.
+  if (s.ok() && !f && !leftover.empty() && !range.empty()) {
+    fp.ReplaceRange(range);
+    batches.emplace_back(&leftover, fp);
+    to_process.emplace_back(batches.size() - 1);
+  }
+  // 1. If f is non-null, that means we might not be done with this level.
+  //    This can happen if one of the keys is the last key in the file, i.e.
+  //    fp.KeyMaySpanNextFile() is true.
+  // 2. If range is empty, then we're done with this range and no need to
+  //    prepare the next level
+  // 3. If some tasks were queued for this range, then the next level will be
+  //    prepared after executing those tasks
+  if (!f && !range.empty() && !num_tasks_queued) {
+    fp.PrepareNextLevelForSearch();
+  }
+  return s;
+}
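+// Illustrative note (not upstream code): the range bookkeeping above splits a
+// batch by set algebra on key ranges. As a hypothetical example, suppose a
+// batch holds keys {k1, k2, k3} and the file under inspection may contain
+// only {k1, k3}. Then file_range = {k1, k3}, so
+//
+//   leftover += ~file_range;  // leftover gains {k2}
+//   range    -= ~file_range;  // range shrinks to {k1, k3}
+//
+// and once this level is exhausted, {k2} continues as a separate batch that
+// can be probed against deeper levels in parallel with the pending lookups.
+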
+
+Status Version::MultiGetAsync(
+    const ReadOptions& options, MultiGetRange* range,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
+  autovector<FilePickerMultiGet, 4> batches;
+  std::deque<size_t> waiting;
+  std::deque<size_t> to_process;
+  Status s;
+  std::vector<folly::coro::Task<Status>> mget_tasks;
+  std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
+
+  // Create the initial batch with the input range
+  batches.emplace_back(range, &storage_info_.level_files_brief_,
+                       storage_info_.num_non_empty_levels_,
+                       &storage_info_.file_indexer_, user_comparator(),
+                       internal_comparator());
+  to_process.emplace_back(0);
+
+  while (!to_process.empty()) {
+    // As we process a batch, it may get split into two. So reserve space for
+    // an additional batch in the autovector in order to prevent later moves
+    // of elements in ProcessBatch().
+    batches.reserve(batches.size() + 1);
+
+    size_t idx = to_process.front();
+    FilePickerMultiGet* batch = &batches.at(idx);
+    unsigned int num_tasks_queued = 0;
+    to_process.pop_front();
+    if (batch->IsSearchEnded() || batch->GetRange().empty()) {
+      // If to_process is empty, i.e. no more batches to look at, then we need
+      // to schedule the enqueued coroutines and wait for them. Otherwise, we
+      // skip this batch and move to the next one in to_process.
+      if (!to_process.empty()) {
+        continue;
+      }
+    } else {
+      // Look through one level. This may split the batch and enqueue it to
+      // to_process
+      s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches,
+                       waiting, to_process, num_tasks_queued, mget_stats);
+      // If ProcessBatch didn't enqueue any coroutine tasks, it means all
+      // keys were filtered out. So put the batch back in to_process to
+      // lookup in the next level
+      if (!num_tasks_queued && !batch->IsSearchEnded()) {
+        // Put this back in the processing queue
+        to_process.emplace_back(idx);
+      } else if (num_tasks_queued) {
+        waiting.emplace_back(idx);
+      }
+    }
+    // If ProcessBatch() returned an error, then schedule the enqueued
+    // coroutines and wait for them, then abort the MultiGet.
+    if (to_process.empty() || !s.ok()) {
+      if (mget_tasks.size() > 0) {
+        assert(waiting.size());
+        RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+                   mget_tasks.size());
+        // Collect all results so far
+        std::vector<Status> statuses = folly::coro::blockingWait(
+            folly::coro::collectAllRange(std::move(mget_tasks))
+                .scheduleOn(&range->context()->executor()));
+        mget_tasks.clear();
+        if (s.ok()) {
+          for (Status stat : statuses) {
+            if (!stat.ok()) {
+              s = std::move(stat);
+              break;
+            }
+          }
+        }
+
+        if (!s.ok()) {
+          break;
+        }
+
+        for (size_t wait_idx : waiting) {
+          FilePickerMultiGet& fp = batches.at(wait_idx);
+          // 1. If fp.GetHitFile() is non-null, then there could be more
+          //    overlap in this level. So skip preparing next level.
+          // 2. If fp.GetRange() is empty, then this batch is completed
+          //    and no need to prepare the next level.
+          if (!fp.GetHitFile() && !fp.GetRange().empty()) {
+            fp.PrepareNextLevelForSearch();
+          }
+        }
+        to_process.swap(waiting);
+      } else {
+        assert(!s.ok() || waiting.size() == 0);
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t num_levels = 0;
+  for (auto& stat : mget_stats) {
+    if (stat.first == 0) {
+      num_levels += std::get<2>(stat.second);
+    } else {
+      num_levels++;
+    }
+
+    uint64_t num_meta_reads =
+        std::get<0>(stat.second) + std::get<1>(stat.second);
+    uint64_t num_sst_reads = std::get<2>(stat.second);
+    if (num_meta_reads > 0) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_meta_reads);
+    }
+    if (num_sst_reads > 0) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+                        num_sst_reads);
+    }
+  }
+  if (num_levels > 0) {
+    RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+                      num_levels);
+  }
+
+  return s;
+}
+#endif
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+  // Reaching the bottom level implies misses at all upper levels, so we'll
+  // skip checking the filters when we predict a hit.
+  return cfd_->ioptions()->optimize_filters_for_hits &&
+         (level > 0 || is_file_last_in_level) &&
+         level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+  level_files_brief_.resize(num_non_empty_levels_);
+  for (int level = 0; level < num_non_empty_levels_; level++) {
+    DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
+                              &arena_);
+  }
+}
+
+void VersionStorageInfo::PrepareForVersionAppend(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  ComputeCompensatedSizes();
+  UpdateNumNonEmptyLevels();
+  CalculateBaseBytes(immutable_options, mutable_cf_options);
+  UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
+  GenerateFileIndexer();
+  GenerateLevelFilesBrief();
+  GenerateLevel0NonOverlapping();
+  if (!immutable_options.allow_ingest_behind) {
+    GenerateBottommostFiles();
+  }
+  GenerateFileLocationIndex();
+}
+
+void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                            bool update_stats) {
+  TEST_SYNC_POINT_CALLBACK(
+      "Version::PrepareAppend:forced_check",
+      reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
+
+  if (update_stats) {
+    UpdateAccumulatedStats();
+  }
+
+  storage_info_.PrepareForVersionAppend(*cfd_->ioptions(),
+                                        mutable_cf_options);
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file ||
+      file_meta->compensated_file_size > 0) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  Status s = GetTableProperties(&tp, file_meta);
+  file_meta->init_stats_from_file = true;
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+                    "Unable to load table properties for file %" PRIu64
+                    " --- %s\n",
+                    file_meta->fd.GetNumber(), s.ToString().c_str());
+    return false;
+  }
+  if (tp.get() == nullptr) return false;
+  file_meta->num_entries = tp->num_entries;
+  file_meta->num_deletions = tp->num_deletions;
+  file_meta->raw_value_size = tp->raw_value_size;
+  file_meta->raw_key_size = tp->raw_key_size;
+
+  return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+  TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+                           nullptr);
+
+  assert(file_meta->init_stats_from_file);
+  accumulated_file_size_ += file_meta->fd.GetFileSize();
+  accumulated_raw_key_size_ += file_meta->raw_key_size;
+  accumulated_raw_value_size_ += file_meta->raw_value_size;
+  accumulated_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  accumulated_num_deletions_ += file_meta->num_deletions;
+
+  current_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  current_num_deletions_ += file_meta->num_deletions;
+  current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file) {
+    current_num_non_deletions_ -=
+        file_meta->num_entries - file_meta->num_deletions;
+    current_num_deletions_ -= file_meta->num_deletions;
+    current_num_samples_--;
+  }
+}
+
+void Version::UpdateAccumulatedStats() {
+  // maximum number of table properties loaded from files.
+  const int kMaxInitCount = 20;
+  int init_count = 0;
+  // Here, only the first kMaxInitCount files which haven't been initialized
+  // from file will be updated with num_deletions. The motivation is to cap
+  // the maximum I/O per Version creation. The reason for choosing files from
+  // lower levels instead of higher levels is that such a design propagates
+  // the initialization from lower levels to higher levels: when the
+  // num_deletions of lower-level files are updated, those files get accurate
+  // compensated_file_size, which triggers lower-level to higher-level
+  // compactions, which in turn create higher-level files whose num_deletions
+  // will be updated here.
+  for (int level = 0;
+       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+       ++level) {
+    for (auto* file_meta : storage_info_.files_[level]) {
+      if (MaybeInitializeFileMetaData(file_meta)) {
+        // each FileMeta will be initialized only once.
+        storage_info_.UpdateAccumulatedStats(file_meta);
+        // when option "max_open_files" is -1, all the file metadata has
+        // already been read, so MaybeInitializeFileMetaData() won't incur
+        // any I/O cost. "max_open_files=-1" means that the table cache passed
+        // to the VersionSet and then to the ColumnFamilySet has a size of
+        // TableCache::kInfiniteCapacity
+        if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+            TableCache::kInfiniteCapacity) {
+          continue;
+        }
+        if (++init_count >= kMaxInitCount) {
+          break;
+        }
+      }
+    }
+  }
+  // In case all sampled files contain only deletion entries, we load the
+  // table properties of a file in a higher level to initialize that value.
+  for (int level = storage_info_.num_levels_ - 1;
+       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+       --level) {
+    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+  static const int kDeletionWeightOnCompaction = 2;
+  uint64_t average_value_size = GetAverageValueSize();
+
+  // compute the compensated size
+  for (int level = 0; level < num_levels_; level++) {
+    for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // whose compensated_file_size is uninitialized (== 0). This is true
+      // only for files that have been created right now and no other thread
+      // has access to them. That's why we can safely mutate
+      // compensated_file_size.
+      if (file_meta->compensated_file_size == 0) {
+        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // Here we only boost the size of deletion entries of a file when
+        // the number of deletion entries is greater than the number of
+        // non-deletion entries in the file. The motivation is that in a
+        // stable workload, the number of deletion entries should be roughly
+        // equal to the number of non-deletion entries. If we compensated the
+        // size of deletion entries in a stable workload, the deletion
+        // compensation logic might introduce an unwanted effect which
+        // changes the shape of the LSM tree.
+        if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+          file_meta->compensated_file_size +=
+              (file_meta->num_deletions * 2 - file_meta->num_entries) *
+              average_value_size * kDeletionWeightOnCompaction;
+        }
+      }
+    }
+  }
+}
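+// Illustrative note (not upstream code): with the constants above, a 10 MB
+// file holding 3000 entries of which 2000 are deletions, given an average
+// value size of 1 KB, gets
+//
+//   compensated_file_size = 10 MB + (2 * 2000 - 3000) * 1 KB * 2
+//                         = 10 MB + 2 MB
+//
+// so deletion-heavy files look larger to the compaction picker and are
+// compacted sooner, reclaiming the space shadowed by the tombstones.
+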
+
+int VersionStorageInfo::MaxInputLevel() const {
+  if (compaction_style_ == kCompactionStyleLevel) {
+    return num_levels() - 2;
+  }
+  return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+  if (allow_ingest_behind) {
+    assert(num_levels() > 1);
+    return num_levels() - 2;
+  }
+  return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+    const MutableCFOptions& mutable_cf_options) {
+  // Only implemented for level-based compaction
+  if (compaction_style_ != kCompactionStyleLevel) {
+    estimated_compaction_needed_bytes_ = 0;
+    return;
+  }
+
+  // Start from Level 0. If level 0 qualifies for compaction to level 1, we
+  // estimate the size of that compaction. Then we move on to the next level
+  // and see whether it qualifies for compaction to the following level. The
+  // size of a level is estimated as the actual size on the level plus the
+  // input bytes from the previous level, if any. If it exceeds the target,
+  // take the excess bytes as compaction input and add the size of that
+  // compaction to the total. We keep doing this for Level 2, 3, etc., until
+  // the last level, and return the accumulated bytes.
+
+  uint64_t bytes_compact_to_next_level = 0;
+  uint64_t level_size = 0;
+  for (auto* f : files_[0]) {
+    level_size += f->fd.GetFileSize();
+  }
+  // Level 0
+  bool level0_compact_triggered = false;
+  if (static_cast<int>(files_[0].size()) >=
+          mutable_cf_options.level0_file_num_compaction_trigger ||
+      level_size >= mutable_cf_options.max_bytes_for_level_base) {
+    level0_compact_triggered = true;
+    estimated_compaction_needed_bytes_ = level_size;
+    bytes_compact_to_next_level = level_size;
+  } else {
+    estimated_compaction_needed_bytes_ = 0;
+  }
+
+  // Level 1 and up.
+  uint64_t bytes_next_level = 0;
+  for (int level = base_level(); level <= MaxInputLevel(); level++) {
+    level_size = 0;
+    if (bytes_next_level > 0) {
+#ifndef NDEBUG
+      uint64_t level_size2 = 0;
+      for (auto* f : files_[level]) {
+        level_size2 += f->fd.GetFileSize();
+      }
+      assert(level_size2 == bytes_next_level);
+#endif
+      level_size = bytes_next_level;
+      bytes_next_level = 0;
+    } else {
+      for (auto* f : files_[level]) {
+        level_size += f->fd.GetFileSize();
+      }
+    }
+    if (level == base_level() && level0_compact_triggered) {
+      // Add base level size to compaction if level0 compaction triggered.
+      estimated_compaction_needed_bytes_ += level_size;
+    }
+    // Add size added by previous compaction
+    level_size += bytes_compact_to_next_level;
+    bytes_compact_to_next_level = 0;
+    uint64_t level_target = MaxBytesForLevel(level);
+    if (level_size > level_target) {
+      bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as the size ratio
+      // between the two levels.
+
+      assert(bytes_next_level == 0);
+      if (level + 1 < num_levels_) {
+        for (auto* f : files_[level + 1]) {
+          bytes_next_level += f->fd.GetFileSize();
+        }
+      }
+      if (bytes_next_level > 0) {
+        assert(level_size > 0);
+        estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+            static_cast<double>(bytes_compact_to_next_level) *
+            (static_cast<double>(bytes_next_level) /
+                 static_cast<double>(level_size) +
+             1));
+      }
+    }
+  }
+}
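+// Illustrative note (not upstream code): a rough worked example of the
+// estimate above. If a level holds 15 GB against a 10 GB target, then
+// bytes_compact_to_next_level = 5 GB. If the next level holds 90 GB, the
+// fan-out ratio is 90 / 15 = 6, so the pending compaction is charged
+// 5 GB * (6 + 1) = 35 GB: 5 GB read from this level plus an estimated 30 GB
+// of overlapping data rewritten in the next level. The 5 GB then carries
+// forward as input to the next level's estimate.
+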
+
+namespace {
+uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 const std::vector<FileMetaData*>& files) {
+  uint32_t ttl_expired_files_count = 0;
+
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  if (status.ok()) {
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    for (FileMetaData* f : files) {
+      if (!f->being_compacted) {
+        uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+        if (oldest_ancester_time != 0 &&
+            oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+          ttl_expired_files_count++;
+        }
+      }
+    }
+  }
+  return ttl_expired_files_count;
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  double total_downcompact_bytes = 0.0;
+  // Historically, score is defined as actual bytes in a level divided by
+  // the level's target size, and 1.0 is the threshold for triggering
+  // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for prioritization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing score while still
+  // maintaining it to be over 1.0, we scale the original score by 10x
+  // if it is larger than 1.0.
+  const double kScoreScale = 10.0;
+  for (int level = 0; level <= MaxInputLevel(); level++) {
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      int num_sorted_runs = 0;
+      uint64_t total_size = 0;
+      for (auto* f : files_[level]) {
+        total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
+        if (!f->being_compacted) {
+          total_size += f->compensated_file_size;
+          num_sorted_runs++;
+        }
+      }
+      if (compaction_style_ == kCompactionStyleUniversal) {
+        // For universal compaction, we use level0 score to indicate
+        // compaction score for the whole DB. Adding other levels as if
+        // they are L0 files.
+        for (int i = 1; i < num_levels(); i++) {
+          // It's possible that a subset of the files in a level may be in a
+          // compaction, due to delete triggered compaction or trivial move.
+          // In that case, the below check may not catch a level being
+          // compacted as it only checks the first file. The worst that can
+          // happen is a scheduled compaction thread will find nothing to do.
+          if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+            num_sorted_runs++;
+          }
+        }
+      }
+
+      if (compaction_style_ == kCompactionStyleFIFO) {
+        score =
+            static_cast<double>(total_size) /
+            mutable_cf_options.compaction_options_fifo.max_table_files_size;
+        if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
+            mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
+          // Warm tier move can happen at any time. It's too expensive to
+          // check every file's timestamp now. For now, just trigger it
+          // slightly more frequently than FIFO compaction so that this
+          // happens first.
+          score = std::max(
+              static_cast<double>(num_sorted_runs) /
+                  mutable_cf_options.level0_file_num_compaction_trigger,
+              score);
+        }
+        if (mutable_cf_options.ttl > 0) {
+          score = std::max(
+              static_cast<double>(GetExpiredTtlFilesCount(
+                  immutable_options, mutable_cf_options, files_[level])),
+              score);
+        }
+      } else {
+        score = static_cast<double>(num_sorted_runs) /
+                mutable_cf_options.level0_file_num_compaction_trigger;
+        if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+          // Level-based involves L0->L0 compactions that can lead to
+          // oversized L0 files. Take size into account as well to avoid
+          // later giant compactions to the base level.
+          // If the L0 score is always too high, L0->L1 will always be
+          // prioritized over L1->L2 compaction, and L1 will accumulate until
+          // it is too large. But if the L0 score isn't high enough, L0 will
+          // accumulate and data is not moved to L1 fast enough. With
+          // potential L0->L0 compaction, the number of L0 files isn't always
+          // an indication of L0 oversizing, so we also need to consider the
+          // total size of L0.
+          if (immutable_options.level_compaction_dynamic_level_bytes) {
+            if (total_size >= mutable_cf_options.max_bytes_for_level_base) {
+              // When calculating estimated_compaction_needed_bytes, we
+              // assume L0 is qualified as pending compactions. We will need
+              // to make sure that it qualifies for compaction.
+              // It might be guaranteed by logic below anyway, but we are
+              // explicit here to make sure we don't stop writes with no
+              // compaction scheduled.
+              score = std::max(score, 1.01);
+            }
+            if (total_size > level_max_bytes_[base_level_]) {
+              // In this case, we compare L0 size with actual L1 size and
+              // make sure score is more than 1.0 (10.0 after scaled) if L0
+              // is larger than L1. Since in this case L1 score is lower than
+              // 10.0, L0->L1 is prioritized over L1->L2.
+              uint64_t base_level_size = 0;
+              for (auto f : files_[base_level_]) {
+                base_level_size += f->compensated_file_size;
+              }
+              score = std::max(score,
+                               static_cast<double>(total_size) /
+                                   static_cast<double>(std::max(
+                                       base_level_size,
+                                       level_max_bytes_[base_level_])));
+            }
+            if (score > 1.0) {
+              score *= kScoreScale;
+            }
+          } else {
+            score =
+                std::max(score,
+                         static_cast<double>(total_size) /
+                             mutable_cf_options.max_bytes_for_level_base);
+          }
+        }
+      }
+    } else {
+      // Compute the ratio of current size to size limit.
+      uint64_t level_bytes_no_compacting = 0;
+      uint64_t level_total_bytes = 0;
+      for (auto f : files_[level]) {
+        level_total_bytes += f->fd.GetFileSize();
+        if (!f->being_compacted) {
+          level_bytes_no_compacting += f->compensated_file_size;
+        }
+      }
+      if (!immutable_options.level_compaction_dynamic_level_bytes ||
+          level_bytes_no_compacting < MaxBytesForLevel(level)) {
+        score = static_cast<double>(level_bytes_no_compacting) /
+                MaxBytesForLevel(level);
+      } else {
+        // If there is a large amount of data being compacted down to the
+        // current level soon, we de-prioritize compaction from a level where
+        // the incoming data would be a large ratio. We do it by dividing the
+        // level size not by the target level size alone, but by the sum of
+        // the target size and the incoming compaction bytes.
+        score = static_cast<double>(level_bytes_no_compacting) /
+                (MaxBytesForLevel(level) + total_downcompact_bytes) *
+                kScoreScale;
+      }
+      if (level_total_bytes > MaxBytesForLevel(level)) {
+        total_downcompact_bytes +=
+            static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+      }
+    }
+    compaction_level_[level] = level;
+    compaction_score_[level] = score;
+  }
+
+  // Sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries is small.
+  for (int i = 0; i < num_levels() - 2; i++) {
+    for (int j = i + 1; j < num_levels() - 1; j++) {
+      if (compaction_score_[i] < compaction_score_[j]) {
+        double score = compaction_score_[i];
+        int level = compaction_level_[i];
+        compaction_score_[i] = compaction_score_[j];
+        compaction_level_[i] = compaction_level_[j];
+        compaction_score_[j] = score;
+        compaction_level_[j] = level;
+      }
+    }
+  }
+  ComputeFilesMarkedForCompaction();
+  if (!immutable_options.allow_ingest_behind) {
+    ComputeBottommostFilesMarkedForCompaction();
+  }
+  if (mutable_cf_options.ttl > 0) {
+    ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
+  }
+  if (mutable_cf_options.periodic_compaction_seconds > 0) {
+    ComputeFilesMarkedForPeriodicCompaction(
+        immutable_options, mutable_cf_options.periodic_compaction_seconds);
+  }
+
+  if (mutable_cf_options.enable_blob_garbage_collection &&
+      mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
+      mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
+    ComputeFilesMarkedForForcedBlobGC(
+        mutable_cf_options.blob_garbage_collection_age_cutoff,
+        mutable_cf_options.blob_garbage_collection_force_threshold);
+  }
+
+  EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+  files_marked_for_compaction_.clear();
+  int last_qualify_level = 0;
+
+  // Do not include files from the last level with data.
+  // If the table properties collector suggests a file on the last level,
+  // we should not move it to a new level.
+  for (int level = num_levels() - 1; level >= 1; level--) {
+    if (!files_[level].empty()) {
+      last_qualify_level = level - 1;
+      break;
+    }
+  }
+
+  for (int level = 0; level <= last_qualify_level; level++) {
+    for (auto* f : files_[level]) {
+      if (!f->being_compacted && f->marked_for_compaction) {
+        files_marked_for_compaction_.emplace_back(level, f);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+    const ImmutableOptions& ioptions, const uint64_t ttl) {
+  assert(ttl > 0);
+
+  expired_ttl_files_.clear();
+
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    return;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  for (int level = 0; level < num_levels() - 1; level++) {
+    for (FileMetaData* f : files_[level]) {
+      if (!f->being_compacted) {
+        uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+        if (oldest_ancester_time > 0 &&
+            oldest_ancester_time < (current_time - ttl)) {
+          expired_ttl_files_.emplace_back(level, f);
+        }
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+    const ImmutableOptions& ioptions,
+    const uint64_t periodic_compaction_seconds) {
+  assert(periodic_compaction_seconds > 0);
+
+  files_marked_for_periodic_compaction_.clear();
+
+  int64_t temp_current_time;
+  auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
+  if (!status.ok()) {
+    return;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+  // If periodic_compaction_seconds is larger than current time, periodic
+  // compaction can't possibly be triggered.
+  if (periodic_compaction_seconds > current_time) {
+    return;
+  }
+
+  const uint64_t allowed_time_limit =
+      current_time - periodic_compaction_seconds;
+
+  for (int level = 0; level < num_levels(); level++) {
+    for (auto f : files_[level]) {
+      if (!f->being_compacted) {
+        // Compute a file's modification time in the following order:
+        // 1. Use file_creation_time table property if it is > 0.
+        // 2. Use creation_time table property if it is > 0.
+        // 3. Use file's mtime metadata if the above two table properties
+        //    are 0.
+        // Don't consider the file at all if the modification time cannot be
+        // correctly determined based on the above conditions.
+        uint64_t file_modification_time = f->TryGetFileCreationTime();
+        if (file_modification_time == kUnknownFileCreationTime) {
+          file_modification_time = f->TryGetOldestAncesterTime();
+        }
+        if (file_modification_time == kUnknownOldestAncesterTime) {
+          auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+                                         f->fd.GetPathId());
+          status = ioptions.env->GetFileModificationTime(
+              file_path, &file_modification_time);
+          if (!status.ok()) {
+            ROCKS_LOG_WARN(ioptions.logger,
+                           "Can't get file modification time: %s: %s",
+                           file_path.c_str(), status.ToString().c_str());
+            continue;
+          }
+        }
+        if (file_modification_time > 0 &&
+            file_modification_time < allowed_time_limit) {
+          files_marked_for_periodic_compaction_.emplace_back(level, f);
+        }
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
+    double blob_garbage_collection_age_cutoff,
+    double blob_garbage_collection_force_threshold) {
+  files_marked_for_forced_blob_gc_.clear();
+
+  if (blob_files_.empty()) {
+    return;
+  }
+
+  // Number of blob files eligible for GC based on age
+  const size_t cutoff_count = static_cast<size_t>(
+      blob_garbage_collection_age_cutoff * blob_files_.size());
+  if (!cutoff_count) {
+    return;
+  }
+
+  // Compute the sum of total and garbage bytes over the oldest batch of blob
+  // files. The oldest batch is defined as the set of blob files which are
+  // kept alive by the same SSTs as the very oldest one. Here is a toy
+  // example. Let's assume we have three SSTs 1, 2, and 3, and four blob
+  // files 10, 11, 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob
+  // file 10 and potentially some higher-numbered ones, while SST 3 relies on
+  // blob file 12 and potentially some higher-numbered ones. Then, the SST to
+  // oldest blob file mapping is as follows:
+  //
+  // SST file number               Oldest blob file number
+  // 1                             10
+  // 2                             10
+  // 3                             12
+  //
+  // This is what the same thing looks like from the blob files' POV. (Note
+  // that the linked SSTs simply denote the inverse mapping of the above.)
+  //
+  // Blob file number              Linked SST set
+  // 10                            {1, 2}
+  // 11                            {}
+  // 12                            {3}
+  // 13                            {}
+  //
+  // Then, the oldest batch of blob files consists of blob files 10 and 11,
+  // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
+  //
+  // Note that the overall ratio of garbage computed for the batch has to
+  // exceed blob_garbage_collection_force_threshold and the entire batch has
+  // to be eligible for GC according to blob_garbage_collection_age_cutoff in
+  // order for us to schedule any compactions.
+  const auto& oldest_meta = blob_files_.front();
+  assert(oldest_meta);
+
+  const auto& linked_ssts = oldest_meta->GetLinkedSsts();
+  assert(!linked_ssts.empty());
+
+  size_t count = 1;
+  uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
+  uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
+
+  assert(cutoff_count <= blob_files_.size());
+
+  for (; count < cutoff_count; ++count) {
+    const auto& meta = blob_files_[count];
+    assert(meta);
+
+    if (!meta->GetLinkedSsts().empty()) {
+      // Found the beginning of the next batch of blob files
+      break;
+    }
+
+    sum_total_blob_bytes += meta->GetTotalBlobBytes();
+    sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
+  }
+
+  if (count < blob_files_.size()) {
+    const auto& meta = blob_files_[count];
+    assert(meta);
+
+    if (meta->GetLinkedSsts().empty()) {
+      // Some files in the oldest batch are not eligible for GC
+      return;
+    }
+  }
+
+  if (sum_garbage_blob_bytes <
+      blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
+    return;
+  }
+
+  for (uint64_t sst_file_number : linked_ssts) {
+    const FileLocation location = GetFileLocation(sst_file_number);
+    assert(location.IsValid());
+
+    const int level = location.GetLevel();
+    assert(level >= 0);
+
+    const size_t pos = location.GetPosition();
+
+    FileMetaData* const sst_meta = files_[level][pos];
+    assert(sst_meta);
+
+    if (sst_meta->being_compacted) {
+      continue;
+    }
+
+    files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
+  }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+  size_t index;
+  FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareCompensatedSizeDescending(const Fsize& first,
+                                      const Fsize& second) {
+  return (first.file->compensated_file_size >
+          second.file->compensated_file_size);
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
+  auto& level_files = files_[level];
+  level_files.push_back(f);
+
+  f->refs++;
+}
+
+void VersionStorageInfo::AddBlobFile(
+    std::shared_ptr<BlobFileMetaData> blob_file_meta) {
+  assert(blob_file_meta);
+
+  assert(blob_files_.empty() ||
+         (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
+                                    blob_file_meta->GetBlobFileNumber()));
+
+  blob_files_.emplace_back(std::move(blob_file_meta));
+}
+
+VersionStorageInfo::BlobFiles::const_iterator
+VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
+  return std::lower_bound(
+      blob_files_.begin(), blob_files_.end(), blob_file_number,
+      [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
+        assert(lhs);
+        return lhs->GetBlobFileNumber() < rhs;
+      });
+}
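+// Illustrative note (not upstream code): blob_files_ is kept sorted by file
+// number (see the assert in AddBlobFile above), which is what makes the
+// std::lower_bound valid. A lookup helper built on it would look roughly
+// like this (hypothetical sketch):
+//
+//   auto it = GetBlobFileMetaDataLB(blob_file_number);
+//   if (it != blob_files_.end() &&
+//       (*it)->GetBlobFileNumber() == blob_file_number) {
+//     return *it;  // O(log n) exact-match lookup
+//   }
+//   return nullptr;
+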
+
+void VersionStorageInfo::SetFinalized() {
+  finalized_ = true;
+
+#ifndef NDEBUG
+  if (compaction_style_ != kCompactionStyleLevel) {
+    // Not level based compaction.
+    return;
+  }
+  assert(base_level_ < 0 || num_levels() == 1 ||
+         (base_level_ >= 1 && base_level_ < num_levels()));
+  // Verify all levels newer than base_level are empty except L0
+  for (int level = 1; level < base_level(); level++) {
+    assert(NumLevelBytes(level) == 0);
+  }
+  uint64_t max_bytes_prev_level = 0;
+  for (int level = base_level(); level < num_levels() - 1; level++) {
+    if (LevelFiles(level).size() == 0) {
+      continue;
+    }
+    assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+    max_bytes_prev_level = MaxBytesForLevel(level);
+  }
+  for (int level = 0; level < num_levels(); level++) {
+    assert(LevelFiles(level).size() == 0 ||
+           LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+    if (LevelFiles(level).size() > 0) {
+      assert(level < num_non_empty_levels());
+    }
+  }
+  assert(compaction_level_.size() > 0);
+  assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+  num_non_empty_levels_ = num_levels_;
+  for (int i = num_levels_ - 1; i >= 0; i--) {
+    if (files_[i].size() != 0) {
+      return;
+    } else {
+      num_non_empty_levels_ = i;
+    }
+  }
+}
+
+namespace {
+// Sort `temp` based on ratio of overlapping size over file size
+void SortFileByOverlappingRatio(
+    const InternalKeyComparator& icmp,
+    const std::vector<FileMetaData*>& files,
+    const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
+    int level, int num_non_empty_levels, uint64_t ttl,
+    std::vector<Fsize>* temp) {
+  std::unordered_map<uint64_t, uint64_t> file_to_order;
+  auto next_level_it = next_level_files.begin();
+
+  int64_t curr_time;
+  Status status = clock->GetCurrentTime(&curr_time);
+  if (!status.ok()) {
+    // If we can't get time, disable TTL.
+    ttl = 0;
+  }
+
+  FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
+                             num_non_empty_levels, level);
+
+  for (auto& file : files) {
+    uint64_t overlapping_bytes = 0;
+    // Skip files in the next level that are smaller than the current file
+    while (next_level_it != next_level_files.end() &&
+           icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+      next_level_it++;
+    }
+
+    while (next_level_it != next_level_files.end() &&
+           icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+      overlapping_bytes += (*next_level_it)->fd.file_size;
+
+      if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // the next level file crosses the largest key boundary of the
+        // current file.
+        break;
+      }
+      next_level_it++;
+    }
+
+    uint64_t ttl_boost_score =
+        (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
+    assert(ttl_boost_score > 0);
+    assert(file->compensated_file_size != 0);
+    file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
+                                          file->compensated_file_size /
+                                          ttl_boost_score;
+  }
+
+  size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort
+                           ? VersionStorageInfo::kNumberFilesToSort
+                           : temp->size();
+
+  std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(),
+                    [&](const Fsize& f1, const Fsize& f2) -> bool {
+                      // If the score is the same, pick the file with smaller
+                      // keys. This makes the algorithm more deterministic,
+                      // and also helps the trivial move case to have more
+                      // files to extend.
+                      if (file_to_order[f1.file->fd.GetNumber()] ==
+                          file_to_order[f2.file->fd.GetNumber()]) {
+                        return icmp.Compare(f1.file->smallest,
+                                            f2.file->smallest) < 0;
+                      }
+                      return file_to_order[f1.file->fd.GetNumber()] <
+                             file_to_order[f2.file->fd.GetNumber()];
+                    });
+}
+
+void SortFileByRoundRobin(const InternalKeyComparator& icmp,
+                          std::vector<InternalKey>* compact_cursor,
+                          bool level0_non_overlapping, int level,
+                          std::vector<Fsize>* temp) {
+  if (level == 0 && !level0_non_overlapping) {
+    // Using kOldestSmallestSeqFirst when level == 0, since the
+    // files may overlap (not fully sorted)
+    std::sort(temp->begin(), temp->end(),
+              [](const Fsize& f1, const Fsize& f2) -> bool {
+                return f1.file->fd.smallest_seqno <
+                       f2.file->fd.smallest_seqno;
+              });
+    return;
+  }
+
+  bool should_move_files =
+      compact_cursor->at(level).size() > 0 && temp->size() > 1;
+
+  // The iterator points to the Fsize with the smallest key larger than or
+  // equal to the given cursor
+  std::vector<Fsize>::iterator current_file_iter;
+  if (should_move_files) {
+    // Find the file of which the smallest key is larger than or equal to
+    // the cursor (the smallest key in the successor file of the last
+    // chosen file); skip this if the cursor is invalid or there is only
+    // one file in this level
+    current_file_iter = std::lower_bound(
+        temp->begin(), temp->end(), compact_cursor->at(level),
+        [&](const Fsize& f, const InternalKey& cursor) -> bool {
+          return icmp.Compare(cursor, f.file->smallest) > 0;
+        });
+
+    should_move_files = current_file_iter != temp->end() &&
+                        current_file_iter != temp->begin();
+  }
+  if (should_move_files) {
+    // Construct a local temporary vector
+    std::vector<Fsize> local_temp;
+    local_temp.reserve(temp->size());
+    // Move the selected file into the first position and its successors
+    // into the second, third, ..., positions
+    for (auto iter = current_file_iter; iter != temp->end(); iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Move the original predecessors of the selected file in a round-robin
+    // manner
+    for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Replace all the items in temp
+    for (size_t i = 0; i < local_temp.size(); i++) {
+      temp->at(i) = local_temp[i];
+    }
+  }
+}
+}  // anonymous namespace
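+// Illustrative note (not upstream code): the rotation above is a simple
+// round-robin restart. As a hypothetical example, if a level holds files
+// [A, B, C, D] sorted by smallest key and the cursor points just past B's
+// range, the resulting order is [C, D, A, B]: compaction resumes at C and
+// wraps around, so every file in the level is eventually picked.
+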
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+    const ImmutableOptions& ioptions, const MutableCFOptions& options) {
+  if (compaction_style_ == kCompactionStyleNone ||
+      compaction_style_ == kCompactionStyleFIFO ||
+      compaction_style_ == kCompactionStyleUniversal) {
+    // don't need this
+    return;
+  }
+  // No need to sort the highest level because it is never compacted.
+  for (int level = 0; level < num_levels() - 1; level++) {
+    const std::vector<FileMetaData*>& files = files_[level];
+    auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+    assert(files_by_compaction_pri.size() == 0);
+
+    // populate a temp vector for sorting based on size
+    std::vector<Fsize> temp(files.size());
+    for (size_t i = 0; i < files.size(); i++) {
+      temp[i].index = i;
+      temp[i].file = files[i];
+    }
+
+    // sort the top number_of_files_to_sort_ based on file size
+    size_t num = VersionStorageInfo::kNumberFilesToSort;
+    if (num > temp.size()) {
+      num = temp.size();
+    }
+    switch (ioptions.compaction_pri) {
+      case kByCompensatedSize:
+        std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                          CompareCompensatedSizeDescending);
+        break;
+      case kOldestLargestSeqFirst:
+        std::sort(temp.begin(), temp.end(),
+                  [](const Fsize& f1, const Fsize& f2) -> bool {
+                    return f1.file->fd.largest_seqno <
+                           f2.file->fd.largest_seqno;
+                  });
+        break;
+      case kOldestSmallestSeqFirst:
+        std::sort(temp.begin(), temp.end(),
+                  [](const Fsize& f1, const Fsize& f2) -> bool {
+                    return f1.file->fd.smallest_seqno <
+                           f2.file->fd.smallest_seqno;
+                  });
+        break;
+      case kMinOverlappingRatio:
+        SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+                                   files_[level + 1], ioptions.clock, level,
+                                   num_non_empty_levels_, options.ttl,
+                                   &temp);
+        break;
+      case kRoundRobin:
+        SortFileByRoundRobin(*internal_comparator_, &compact_cursor_,
+                             level0_non_overlapping_, level, &temp);
+        break;
+      default:
+        assert(false);
+    }
+    assert(temp.size() == files.size());
+
+    // initialize files_by_compaction_pri_
+    for (size_t i = 0; i < temp.size(); i++) {
+      files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+    }
+    next_file_to_compact_by_size_[level] = 0;
+    assert(files_[level].size() == files_by_compaction_pri_[level].size());
+  }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+  assert(!finalized_);
+  level0_non_overlapping_ = true;
+  if (level_files_brief_.size() == 0) {
+    return;
+  }
+
+  // A copy of L0 files sorted by smallest key
+  std::vector<FdWithKeyRange> level0_sorted_file(
+      level_files_brief_[0].files,
+      level_files_brief_[0].files + level_files_brief_[0].num_files);
+  std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+            [this](const FdWithKeyRange& f1,
+                   const FdWithKeyRange& f2) -> bool {
+              return (internal_comparator_->Compare(f1.smallest_key,
+                                                    f2.smallest_key) < 0);
+            });
+
+  for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+    FdWithKeyRange& f = level0_sorted_file[i];
+    FdWithKeyRange& prev = level0_sorted_file[i - 1];
+    if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >=
+        0) {
+      level0_non_overlapping_ = false;
+      break;
+    }
+  }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+  assert(!finalized_);
+  assert(bottommost_files_.empty());
+  for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+    for (size_t file_idx = 0;
+         file_idx < level_files_brief_[level].num_files; ++file_idx) {
+      const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+      int l0_file_idx;
+      if (level == 0) {
+        l0_file_idx = static_cast<int>(file_idx);
+      } else {
+        l0_file_idx = -1;
+      }
+      Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+      Slice largest_user_key = ExtractUserKey(f.largest_key);
+      if (!RangeMightExistAfterSortedRun(smallest_user_key,
+                                         largest_user_key,
+                                         static_cast<int>(level),
+                                         l0_file_idx)) {
+        bottommost_files_.emplace_back(static_cast<int>(level),
+                                       f.file_metadata);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::GenerateFileLocationIndex() {
+  size_t num_files = 0;
+void VersionStorageInfo::GenerateFileLocationIndex() {
+  size_t num_files = 0;
+
+  for (int level = 0; level < num_levels_; ++level) {
+    num_files += files_[level].size();
+  }
+
+  file_locations_.reserve(num_files);
+
+  for (int level = 0; level < num_levels_; ++level) {
+    for (size_t pos = 0; pos < files_[level].size(); ++pos) {
+      const FileMetaData* const meta = files_[level][pos];
+      assert(meta);
+
+      const uint64_t file_number = meta->fd.GetNumber();
+
+      assert(file_locations_.find(file_number) == file_locations_.end());
+      file_locations_.emplace(file_number, FileLocation(level, pos));
+    }
+  }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+  assert(seqnum >= oldest_snapshot_seqnum_);
+  oldest_snapshot_seqnum_ = seqnum;
+  if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+    ComputeBottommostFilesMarkedForCompaction();
+  }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+  bottommost_files_marked_for_compaction_.clear();
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto& level_and_file : bottommost_files_) {
+    if (!level_and_file.second->being_compacted &&
+        level_and_file.second->fd.largest_seqno != 0) {
+      // largest_seqno might be nonzero due to containing the final key in an
+      // earlier compaction, whose seqnum we didn't zero out. Multiple deletions
+      // ensure the file really contains deleted or overwritten keys.
+      if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+        bottommost_files_marked_for_compaction_.push_back(level_and_file);
+      } else {
+        bottommost_files_mark_threshold_ =
+            std::min(bottommost_files_mark_threshold_,
+                     level_and_file.second->fd.largest_seqno);
+      }
+    }
+  }
+}
+
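+// Example for the marking logic above (hypothetical seqnos): if the
+// bottommost files have largest_seqno 0, 15 and 42 while the oldest snapshot
+// sits at seqno 20, the seqno-15 file is marked for compaction right away,
+// the seqno-42 file sets bottommost_files_mark_threshold_ = 42, and the
+// seqno-0 file needs no work. Once UpdateOldestSnapshot() later reports a
+// snapshot seqno above 42, the marking is recomputed and that file becomes
+// eligible as well.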
+void Version::Ref() { ++refs_; }
+
+bool Version::Unref() {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    delete this;
+    return true;
+  }
+  return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+                                        const Slice* smallest_user_key,
+                                        const Slice* largest_user_key) {
+  if (level >= num_non_empty_levels_) {
+    // empty level, no overlap
+    return false;
+  }
+  return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+                               level_files_brief_[level], smallest_user_key,
+                               largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetOverlappingInputs(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool expand_range, InternalKey** next_smallest) const {
+  if (level >= num_non_empty_levels_) {
+    // this level is empty, no overlapping inputs
+    return;
+  }
+
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  const Comparator* user_cmp = user_comparator_;
+  if (level > 0) {
+    GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
+                                          hint_index, file_index, false,
+                                          next_smallest);
+    return;
+  }
+
+  if (next_smallest) {
+    // next_smallest key only makes sense for non-level 0, where files are
+    // non-overlapping
+    *next_smallest = nullptr;
+  }
+
+  Slice user_begin, user_end;
+  if (begin != nullptr) {
+    user_begin = begin->user_key();
+  }
+  if (end != nullptr) {
+    user_end = end->user_key();
+  }
+
+  // index stores the file indices that still need to be checked.
+  std::list<size_t> index;
+  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+    index.emplace_back(i);
+  }
+
+  while (!index.empty()) {
+    bool found_overlapping_file = false;
+    auto iter = index.begin();
+    while (iter != index.end()) {
+      FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+      const Slice file_start = ExtractUserKey(f->smallest_key);
+      const Slice file_limit = ExtractUserKey(f->largest_key);
+      if (begin != nullptr &&
+          user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+        // "f" is completely before specified range; skip it
+        iter++;
+      } else if (end != nullptr &&
+                 user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+        // "f" is completely after specified range; skip it
+        iter++;
+      } else {
+        // "f" overlaps the specified range
+        inputs->emplace_back(files_[level][*iter]);
+        found_overlapping_file = true;
+        // record the first file index.
+        if (file_index && *file_index == -1) {
+          *file_index = static_cast<int>(*iter);
+        }
+        // this file overlaps; erase it so it is not checked again.
+        iter = index.erase(iter);
+        if (expand_range) {
+          if (begin != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+            user_begin = file_start;
+          }
+          if (end != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+            user_end = file_limit;
+          }
+        }
+      }
+    }
+    // if none of the remaining files overlap, we are done
+    if (!found_overlapping_file) {
+      break;
+    }
+  }
+}
+
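+// Worked example for the L0 loop above (hypothetical key ranges): with L0
+// files A[a..c], B[b..f], C[g..h], query range [d..e] and expand_range=true,
+// the first pass picks only B and widens the range to [b..f]; the second pass
+// then picks A because it overlaps the widened range; C never overlaps, so
+// the loop stops after a pass that finds nothing new, returning {B, A}.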
+// Store in "*inputs" the files in "level" that are within range [begin,end]
+// Guarantee a "clean cut" boundary between the files in inputs
+// and the surrounding files and the maximum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index,
+    int* file_index) const {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
+    // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
+    return;
+  }
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+                                        file_index,
+                                        true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// If within_interval is set, then only store the maximum clean inputs
+// within range [begin, end]. "clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool within_interval, InternalKey** next_smallest) const {
+  assert(level > 0);
+
+  auto user_cmp = user_comparator_;
+  const FdWithKeyRange* files = level_files_brief_[level].files;
+  const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // use binary search to find the lower bound and the upper bound.
+  int start_index = 0;
+  int end_index = num_files;
+
+  if (begin != nullptr) {
+    // if within_interval is true, compare by each file's smallest key so
+    // that std::lower_bound lands on files lying entirely at or after *begin
+    // rather than on merely overlapping ones.
+    auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+                                             const InternalKey* k) {
+      auto& file_key = within_interval ? f.file_metadata->smallest
+                                       : f.file_metadata->largest;
+      return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+    };
+
+    start_index = static_cast<int>(
+        std::lower_bound(files,
+                         files + (hint_index == -1 ? num_files : hint_index),
+                         begin, cmp) -
+        files);
+
+    if (start_index > 0 && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && start_index < num_files) {
+        auto& pre_limit = files[start_index - 1].file_metadata->largest;
+        auto& cur_start = files[start_index].file_metadata->smallest;
+        is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+        start_index += is_overlapping;
+      }
+    }
+  }
+
+  if (end != nullptr) {
+    // if within_interval is true, compare by each file's largest key so
+    // that std::upper_bound lands just past files lying entirely at or
+    // before *end.
+    auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+                                             const FdWithKeyRange& f) {
+      auto& file_key = within_interval ? f.file_metadata->largest
+                                       : f.file_metadata->smallest;
+      return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+    };
+
+    end_index = static_cast<int>(
+        std::upper_bound(files + start_index, files + num_files, end, cmp) -
+        files);
+
+    if (end_index < num_files && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && end_index > start_index) {
+        auto& next_start = files[end_index].file_metadata->smallest;
+        auto& cur_limit = files[end_index - 1].file_metadata->largest;
+        is_overlapping =
+            sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+        end_index -= is_overlapping;
+      }
+    }
+  }
+
+  assert(start_index <= end_index);
+
+  // If there were no overlapping files, return immediately.
+  if (start_index == end_index) {
+    if (next_smallest) {
+      *next_smallest = nullptr;
+    }
+    return;
+  }
+
+  assert(start_index < end_index);
+
+  // returns the index where an overlap is found
+  if (file_index) {
+    *file_index = start_index;
+  }
+
+  // insert overlapping files into vector
+  for (int i = start_index; i < end_index; i++) {
+    inputs->push_back(files_[level][i]);
+  }
+
+  if (next_smallest != nullptr) {
+    // Provide the next key outside the range covered by inputs
+    if (end_index < static_cast<int>(files_[level].size())) {
+      **next_smallest = files_[level][end_index]->smallest;
+    } else {
+      *next_smallest = nullptr;
+    }
+  }
+}
+
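+// Example of the "clean cut" trimming above (hypothetical keys): on a level
+// with files F1[a..e], F2[e..g], F3[h..k], within_interval=true and range
+// [e..k], F2 lies inside the range but shares boundary user key 'e' with F1,
+// so sstableKeyCompare() reports an overlap and start_index is advanced past
+// F2; only {F3} is returned, guaranteeing no user key straddles the boundary
+// between the chosen inputs and the surrounding files.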
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < num_levels());
+  return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+    LevelSummaryStorage* scratch) const {
+  int len = 0;
+  if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+    assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+    if (level_multiplier_ != 0.0) {
+      len = snprintf(
+          scratch->buffer, sizeof(scratch->buffer),
+          "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+          base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+    }
+  }
+  len +=
+      snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+  for (int i = 0; i < num_levels(); i++) {
+    int sz = sizeof(scratch->buffer) - len;
+    int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  if (len > 0) {
+    // overwrite the last space
+    --len;
+  }
+  len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                  "] max score %.2f", compaction_score_[0]);
+
+  if (!files_marked_for_compaction_.empty()) {
+    snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+             " (%" ROCKSDB_PRIszt " files need compaction)",
+             files_marked_for_compaction_.size());
+  }
+
+  return scratch->buffer;
+}
+
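+// Example output of LevelSummary() above (hypothetical numbers):
+//   base level 1 level multiplier 10.00 max bytes base 268435456
+//   files[4 5 51 0 0 0 0] max score 1.12 (2 files need compaction)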
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+                                                 int level) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+  for (const auto& f : files_[level]) {
+    int sz = sizeof(scratch->buffer) - len;
+    char sztxt[16];
+    AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+    int ret = snprintf(scratch->buffer + len, sz,
+                       "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+                       f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+                       static_cast<int>(f->being_compacted));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  // overwrite the last space (only if files_[level].size() is non-zero)
+  if (files_[level].size() && len > 0) {
+    --len;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+  uint64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 1; level < num_levels() - 1; level++) {
+    for (const auto& f : files_[level]) {
+      GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < static_cast<int>(level_max_bytes_.size()));
+  return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
+                                            const MutableCFOptions& options) {
+  // Special logic to set number of sorted runs.
+  // It is to match the previous behavior when all files are in L0.
+  int num_l0_count = static_cast<int>(files_[0].size());
+  if (compaction_style_ == kCompactionStyleUniversal) {
+    // For universal compaction, we use level0 score to indicate
+    // compaction score for the whole DB. Adding other levels as if
+    // they are L0 files.
+    for (int i = 1; i < num_levels(); i++) {
+      if (!files_[i].empty()) {
+        num_l0_count++;
+      }
+    }
+  }
+  set_l0_delay_trigger_count(num_l0_count);
+
+  level_max_bytes_.resize(ioptions.num_levels);
+  if (!ioptions.level_compaction_dynamic_level_bytes) {
+    base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+    // Calculate for static bytes base case
+    for (int i = 0; i < ioptions.num_levels; ++i) {
+      if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      } else if (i > 1) {
+        level_max_bytes_[i] = MultiplyCheckOverflow(
+            MultiplyCheckOverflow(level_max_bytes_[i - 1],
+                                  options.max_bytes_for_level_multiplier),
+            options.MaxBytesMultiplerAdditional(i - 1));
+      } else {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      }
+    }
+  } else {
+    uint64_t max_level_size = 0;
+
+    int first_non_empty_level = -1;
+    // Find the size of the non-L0 level holding the most data.
+    // Cannot use the size of the last level because it can be empty or less
+    // than previous levels after compaction.
+    for (int i = 1; i < num_levels_; i++) {
+      uint64_t total_size = 0;
+      for (const auto& f : files_[i]) {
+        total_size += f->fd.GetFileSize();
+      }
+      if (total_size > 0 && first_non_empty_level == -1) {
+        first_non_empty_level = i;
+      }
+      if (total_size > max_level_size) {
+        max_level_size = total_size;
+      }
+    }
+
+    // Prefill every level's max bytes to disallow compaction from there.
+    for (int i = 0; i < num_levels_; i++) {
+      level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+    }
+
+    if (max_level_size == 0) {
+      // No data for L1 and up. L0 compacts to last level directly.
+      // No compaction from L1+ needs to be scheduled.
+      base_level_ = num_levels_ - 1;
+    } else {
+      uint64_t base_bytes_max = options.max_bytes_for_level_base;
+      uint64_t base_bytes_min = static_cast<uint64_t>(
+          base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // Try to make the last level's target size equal max_level_size
+      uint64_t cur_level_size = max_level_size;
+      for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+        // Round up after dividing
+        cur_level_size = static_cast<uint64_t>(
+            cur_level_size / options.max_bytes_for_level_multiplier);
+      }
+
+      // Calculate base level and its size.
+      uint64_t base_level_size;
+      if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level be
+        // max_level_size, the target size of the first non-empty level would
+        // be smaller than base_bytes_min. We set it to base_bytes_min.
+        base_level_size = base_bytes_min + 1U;
+        base_level_ = first_non_empty_level;
+        ROCKS_LOG_INFO(ioptions.logger,
+                       "More existing levels in DB than needed. "
+                       "max_bytes_for_level_multiplier may not be guaranteed.");
+      } else {
+        // Find base level (where L0 data is compacted to).
+        base_level_ = first_non_empty_level;
+        while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+          --base_level_;
+          cur_level_size = static_cast<uint64_t>(
+              cur_level_size / options.max_bytes_for_level_multiplier);
+        }
+        if (cur_level_size > base_bytes_max) {
+          // Even L1 will be too large
+          assert(base_level_ == 1);
+          base_level_size = base_bytes_max;
+        } else {
+          base_level_size = cur_level_size;
+        }
+      }
+
+      level_multiplier_ = options.max_bytes_for_level_multiplier;
+      assert(base_level_size > 0);
+
+      uint64_t level_size = base_level_size;
+      for (int i = base_level_; i < num_levels_; i++) {
+        if (i > base_level_) {
+          level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+        }
+        // Don't set any level below base_bytes_max. Otherwise, the LSM can
+        // assume an hourglass shape where L1+ sizes are smaller than L0. This
+        // causes compaction scoring, which depends on level sizes, to favor L1+
+        // at the expense of L0, which may fill up and stall.
+        level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+      }
+    }
+  }
+}
+
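+// Worked example for the dynamic sizing in CalculateBaseBytes() above
+// (hypothetical numbers): with 7 levels, multiplier 10,
+// max_bytes_for_level_base = 256MB (so base_bytes_min = 25.6MB),
+// first_non_empty_level = 5 and max_level_size = 3GB, dividing down gives
+// cur_level_size = 300MB > base_bytes_min, so the else-branch walks
+// base_level_ from 5 to 4 (cur_level_size becomes 30MB <= 256MB). The
+// resulting targets are L4 = max(30MB, 256MB) = 256MB, L5 = 300MB and
+// L6 = 3GB, while L1-L3 keep uint64_t max so no compaction is scheduled
+// from them.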
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up the size of a maximal set of
+  // sst files with no range overlap in same or higher level. The less
+  // compacted, the more optimistic (smaller) this estimate is. Also,
+  // for multiple sorted runs within a level, file order will matter.
+  uint64_t size = 0;
+
+  auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+    return internal_comparator_->Compare(*x, *y) < 0;
+  };
+  // (Ordered) map of largest keys in files being included in size estimate
+  std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
+
+  for (int l = num_levels_ - 1; l >= 0; l--) {
+    bool found_end = false;
+    for (auto file : files_[l]) {
+      // Find the first file already included whose largest key is larger
+      // than the smallest key of `file`. If that file does not overlap with
+      // the current file, none of the files in the map does. If there is
+      // no potential overlap, we can safely insert the rest of this level
+      // (if the level is not 0) into the map without checking again because
+      // the elements in the level are sorted and non-overlapping.
+      auto lb = (found_end && l != 0) ? ranges.end()
+                                      : ranges.lower_bound(&file->smallest);
+      found_end = (lb == ranges.end());
+      if (found_end || internal_comparator_->Compare(
+                           file->largest, (*lb).second->smallest) < 0) {
+        ranges.emplace_hint(lb, &file->largest, file);
+        size += file->fd.file_size;
+      }
+    }
+  }
+
+  // For BlobDB, the result also includes the exact value of live bytes in the
+  // blob files of the version.
+  for (const auto& meta : blob_files_) {
+    assert(meta);
+
+    size += meta->GetTotalBlobBytes();
+    size -= meta->GetGarbageBlobBytes();
+  }
+
+  return size;
+}
+
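+// Example for EstimateLiveDataSize() above (hypothetical files): if L6 holds
+// F1[a..f] and L1 holds F2[c..d] over the same user keys, F1 is inserted
+// first (levels are scanned bottom-up) and F2 then overlaps an included
+// file, so only F1's size is counted: data that will eventually be merged
+// away is not double-counted.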
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int last_level, int last_l0_idx) {
+  assert((last_l0_idx != -1) == (last_level == 0));
+  // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+  // bottommost only if it's the oldest L0 file and there are no files on older
+  // levels. It'd be better to consider it bottommost if there's no overlap in
+  // older levels/files.
+  if (last_level == 0 &&
+      last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+    return true;
+  }
+
+  // Checks whether there are files living beyond the `last_level`. If lower
+  // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottom-level optimizations can be made if
+  // there are no files in lower levels or if there is no overlap with the
+  // files in the lower levels.
+  for (int level = last_level + 1; level < num_levels(); level++) {
+    // The range is not in the bottommost level if there are files in lower
+    // levels when the `last_level` is 0 or if there are files in lower levels
+    // which overlap with [`smallest_key`, `largest_key`].
+    if (files_[level].size() > 0 &&
+        (last_level == 0 ||
+         OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                           std::vector<uint64_t>* live_blob_files) const {
+  assert(live_table_files);
+  assert(live_blob_files);
+
+  for (int level = 0; level < storage_info_.num_levels(); ++level) {
+    const auto& level_files = storage_info_.LevelFiles(level);
+    for (const auto& meta : level_files) {
+      assert(meta);
+
+      live_table_files->emplace_back(meta->fd.GetNumber());
+    }
+  }
+
+  const auto& blob_files = storage_info_.GetBlobFiles();
+  for (const auto& meta : blob_files) {
+    assert(meta);
+
+    live_blob_files->emplace_back(meta->GetBlobFileNumber());
+  }
+}
+
+void Version::RemoveLiveFiles(
+    std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+    std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+  for (ObsoleteFileInfo& fi : sst_delete_candidates) {
+    if (!fi.only_delete_metadata &&
+        storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) !=
+            VersionStorageInfo::FileLocation::Invalid()) {
+      fi.only_delete_metadata = true;
+    }
+  }
+
+  blob_delete_candidates.erase(
+      std::remove_if(
+          blob_delete_candidates.begin(), blob_delete_candidates.end(),
+          [this](ObsoleteBlobFileInfo& x) {
+            return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber());
+          }),
+      blob_delete_candidates.end());
+}
+
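+// Usage sketch for AddLiveFiles() above (illustrative; `v` is assumed to be
+// a referenced Version*):
+//   std::vector<uint64_t> table_numbers;
+//   std::vector<uint64_t> blob_numbers;
+//   v->AddLiveFiles(&table_numbers, &blob_numbers);
+//   // table_numbers / blob_numbers now list every live SST / blob file
+//   // number in this version.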
"); + r.append(files[i]->largest.DebugString(hex)); + r.append("]"); + if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, files[i]->oldest_blob_file_number); + } + if (print_stats) { + r.append("("); + r.append(std::to_string( + files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed))); + r.append(")"); + } + r.append("\n"); + } + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + if (!blob_files.empty()) { + r.append("--- blob files --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + for (const auto& blob_file_meta : blob_files) { + assert(blob_file_meta); + + r.append(blob_file_meta->DebugString()); + r.push_back('\n'); + } + } + + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + InstrumentedCondVar cv; + ColumnFamilyData* cfd; + const MutableCFOptions mutable_cf_options; + const autovector& edit_list; + const std::function manifest_write_callback; + + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) + : done(false), + cv(mu), + cfd(_cfd), + mutable_cf_options(cf_options), + edit_list(e), + manifest_write_callback(manifest_wcb) {} + ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } +}; + +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit; + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. 
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+  assert(edit);
+  if (edit->is_in_atomic_group_) {
+    TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+    if (replay_buffer_.empty()) {
+      replay_buffer_.resize(edit->remaining_entries_ + 1);
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+    }
+    read_edits_in_atomic_group_++;
+    if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+        static_cast<uint64_t>(replay_buffer_.size())) {
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+      return Status::Corruption("corrupted atomic group");
+    }
+    replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+    if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+      return Status::OK();
+    }
+    return Status::OK();
+  }
+
+  // A normal edit.
+  if (!replay_buffer().empty()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+    return Status::Corruption("corrupted atomic group");
+  }
+  return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+  return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+  read_edits_in_atomic_group_ = 0;
+  replay_buffer_.clear();
+}
+
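+// Example for AddEdit() above (hypothetical group): an atomic group of three
+// edits arrives with remaining_entries_ = 2, 1, 0. The first edit sizes
+// replay_buffer_ to 3; each AddEdit() then checks that
+// read_edits_in_atomic_group_ + remaining_entries_ == 3, and the buffer is
+// only considered complete (IsFull()) once the edit with remaining == 0 has
+// been stored. Any mismatch is reported as a corrupted atomic group.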
+VersionSet::VersionSet(const std::string& dbname,
+                       const ImmutableDBOptions* _db_options,
+                       const FileOptions& storage_options, Cache* table_cache,
+                       WriteBufferManager* write_buffer_manager,
+                       WriteController* write_controller,
+                       BlockCacheTracer* const block_cache_tracer,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       const std::string& db_id,
+                       const std::string& db_session_id)
+    : column_family_set_(new ColumnFamilySet(
+          dbname, _db_options, storage_options, table_cache,
+          write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
+          db_id, db_session_id)),
+      table_cache_(table_cache),
+      env_(_db_options->env),
+      fs_(_db_options->fs, io_tracer),
+      clock_(_db_options->clock),
+      dbname_(dbname),
+      db_options_(_db_options),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      options_file_number_(0),
+      options_file_size_(0),
+      pending_manifest_file_number_(0),
+      last_sequence_(0),
+      last_allocated_sequence_(0),
+      last_published_sequence_(0),
+      prev_log_number_(0),
+      current_version_number_(0),
+      manifest_file_size_(0),
+      file_options_(storage_options),
+      block_cache_tracer_(block_cache_tracer),
+      io_tracer_(io_tracer),
+      db_session_id_(db_session_id) {}
+
+VersionSet::~VersionSet() {
+  // we need to delete column_family_set_ because its destructor depends on
+  // VersionSet
+  column_family_set_.reset();
+  for (auto& file : obsolete_files_) {
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
+      TableCache::Evict(table_cache_, file.metadata->fd.GetNumber());
+    }
+    file.DeleteMetadata();
+  }
+  obsolete_files_.clear();
+  io_status_.PermitUncheckedError();
+}
+
+void VersionSet::Reset() {
+  if (column_family_set_) {
+    WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
+    WriteController* wc = column_family_set_->write_controller();
+    // db_id becomes the source of truth after DBImpl::Recover():
+    // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
+    // Note: we may not be able to recover db_id from MANIFEST if
+    // options.write_dbid_to_manifest is false (default).
+    column_family_set_.reset(new ColumnFamilySet(
+        dbname_, db_options_, file_options_, table_cache_, wbm, wc,
+        block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
+  }
+  db_id_.clear();
+  next_file_number_.store(2);
+  min_log_number_to_keep_.store(0);
+  manifest_file_number_ = 0;
+  options_file_number_ = 0;
+  pending_manifest_file_number_ = 0;
+  last_sequence_.store(0);
+  last_allocated_sequence_.store(0);
+  last_published_sequence_.store(0);
+  prev_log_number_ = 0;
+  descriptor_log_.reset();
+  current_version_number_ = 0;
+  manifest_writers_.clear();
+  manifest_file_size_ = 0;
+  obsolete_files_.clear();
+  obsolete_manifests_.clear();
+  wals_.Reset();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+                               Version* v) {
+  // compute new compaction score
+  v->storage_info()->ComputeCompactionScore(
+      *column_family_data->ioptions(),
+      *column_family_data->GetLatestMutableCFOptions());
+
+  // Mark v finalized
+  v->storage_info_.SetFinalized();
+
+  // Make "v" current
+  assert(v->refs_ == 0);
+  Version* current = column_family_data->current();
+  assert(v != current);
+  if (current != nullptr) {
+    assert(current->refs_ > 0);
+    current->Unref();
+  }
+  column_family_data->SetCurrent(v);
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = column_family_data->dummy_versions()->prev_;
+  v->next_ = column_family_data->dummy_versions();
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
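+// Sketch of the list invariant maintained by AppendVersion() above
+// (illustrative):
+//   dummy <-> v1 <-> v2 <-> v3 <-> dummy   (circular; v3 is the newest)
+// The new version is linked in just before dummy_versions() and becomes
+// cfd->current(), while the previous current version is unref'd.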
+Status VersionSet::ProcessManifestWrites(
+    std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+    FSDirectory* dir_contains_current_file, bool new_descriptor_log,
+    const ColumnFamilyOptions* new_cf_options) {
+  mu->AssertHeld();
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  ManifestWriter* last_writer = &first_writer;
+
+  assert(!manifest_writers_.empty());
+  assert(manifest_writers_.front() == &first_writer);
+
+  autovector<VersionEdit*> batch_edits;
+  autovector<Version*> versions;
+  autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+  std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+  // Tracking `max_last_sequence` is needed to ensure we write
+  // `VersionEdit::last_sequence_`s in non-decreasing order according to the
+  // recovery code's requirement. It also allows us to defer updating
+  // `descriptor_last_sequence_` until the apply phase, after the log phase
+  // succeeds.
+  SequenceNumber max_last_sequence = descriptor_last_sequence_;
+
+  if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+    // No group commits for column family add or drop
+    LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
+    batch_edits.push_back(first_writer.edit_list.front());
+  } else {
+    auto it = manifest_writers_.cbegin();
+    size_t group_start = std::numeric_limits<size_t>::max();
+    while (it != manifest_writers_.cend()) {
+      if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+        // no group commits for column family add or drop
+        break;
+      }
+      last_writer = *(it++);
+      assert(last_writer != nullptr);
+      assert(last_writer->cfd != nullptr);
+      if (last_writer->cfd->IsDropped()) {
+        // If we detect a dropped CF at this point, and the corresponding
+        // version edits belong to an atomic group, then we need to find out
+        // the preceding version edits in the same atomic group, and update
+        // their `remaining_entries_` member variable because we are NOT going
+        // to write the version edits of the dropped CF to the MANIFEST. If we
+        // don't update, then Recover can report a corrupted atomic group
+        // because the `remaining_entries_` do not match.
+        if (!batch_edits.empty()) {
+          if (batch_edits.back()->is_in_atomic_group_ &&
+              batch_edits.back()->remaining_entries_ > 0) {
+            assert(group_start < batch_edits.size());
+            const auto& edit_list = last_writer->edit_list;
+            size_t k = 0;
+            while (k < edit_list.size()) {
+              if (!edit_list[k]->is_in_atomic_group_) {
+                break;
+              } else if (edit_list[k]->remaining_entries_ == 0) {
+                ++k;
+                break;
+              }
+              ++k;
+            }
+            for (auto i = group_start; i < batch_edits.size(); ++i) {
+              assert(static_cast<uint32_t>(k) <=
+                     batch_edits.back()->remaining_entries_);
+              batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+            }
+          }
+        }
+        continue;
+      }
+      // We do a linear search on versions because versions is small.
+      // TODO(yanqin) maybe consider unordered_map
+      Version* version = nullptr;
+      VersionBuilder* builder = nullptr;
+      for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+        uint32_t cf_id = last_writer->cfd->GetID();
+        if (versions[i]->cfd()->GetID() == cf_id) {
+          version = versions[i];
+          assert(!builder_guards.empty() &&
+                 builder_guards.size() == versions.size());
+          builder = builder_guards[i]->version_builder();
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+          break;
+        }
+      }
+      if (version == nullptr) {
+        // WAL manipulations do not need to be applied to versions.
+        if (!last_writer->IsAllWalEdits()) {
+          version = new Version(last_writer->cfd, this, file_options_,
+                                last_writer->mutable_cf_options, io_tracer_,
+                                current_version_number_++);
+          versions.push_back(version);
+          mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+          builder_guards.emplace_back(
+              new BaseReferencedVersionBuilder(last_writer->cfd));
+          builder = builder_guards.back()->version_builder();
+        }
+        assert(last_writer->IsAllWalEdits() || builder);
+        assert(last_writer->IsAllWalEdits() || version);
+        TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion",
+                                 version);
+      }
+      for (const auto& e : last_writer->edit_list) {
+        if (e->is_in_atomic_group_) {
+          if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+              (batch_edits.back()->is_in_atomic_group_ &&
+               batch_edits.back()->remaining_entries_ == 0)) {
+            group_start = batch_edits.size();
+          }
+        } else if (group_start != std::numeric_limits<size_t>::max()) {
+          group_start = std::numeric_limits<size_t>::max();
+        }
+        Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
+                                     &max_last_sequence, mu);
+        if (!s.ok()) {
+          // free up the allocated memory
+          for (auto v : versions) {
+            delete v;
+          }
+          return s;
+        }
+        batch_edits.push_back(e);
+      }
+    }
+    for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+      assert(!builder_guards.empty() &&
+             builder_guards.size() == versions.size());
+      auto* builder = builder_guards[i]->version_builder();
+      Status s = builder->SaveTo(versions[i]->storage_info());
+      if (!s.ok()) {
+        // free up the allocated memory
+        for (auto v : versions) {
+          delete v;
+        }
+        return s;
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  // Verify that version edits of atomic groups have correct
+  // remaining_entries_.
+  size_t k = 0;
+  while (k < batch_edits.size()) {
+    while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+      ++k;
+    }
+    if (k == batch_edits.size()) {
+      break;
+    }
+    size_t i = k;
+    while (i < batch_edits.size()) {
+      if (!batch_edits[i]->is_in_atomic_group_) {
+        break;
+      }
+      assert(i - k + batch_edits[i]->remaining_entries_ ==
+             batch_edits[k]->remaining_entries_);
+      if (batch_edits[i]->remaining_entries_ == 0) {
+        ++i;
+        break;
+      }
+      ++i;
+    }
+    assert(batch_edits[i - 1]->is_in_atomic_group_);
+    assert(0 == batch_edits[i - 1]->remaining_entries_);
+    std::vector<VersionEdit*> tmp;
+    for (size_t j = k; j != i; ++j) {
+      tmp.emplace_back(batch_edits[j]);
+    }
+    TEST_SYNC_POINT_CALLBACK(
+        "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+    k = i;
+  }
+#endif  // NDEBUG
+
+  assert(pending_manifest_file_number_ == 0);
+  if (!descriptor_log_ ||
+      manifest_file_size_ > db_options_->max_manifest_file_size) {
+    TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+    new_descriptor_log = true;
+  } else {
+    pending_manifest_file_number_ = manifest_file_number_;
+  }
+
+  // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+  // reads its content after releasing db mutex to avoid race with
+  // SwitchMemtable().
+  std::unordered_map<uint32_t, MutableCFState> curr_state;
+  VersionEdit wal_additions;
+  if (new_descriptor_log) {
+    pending_manifest_file_number_ = NewFileNumber();
+    batch_edits.back()->SetNextFile(next_file_number_.load());
+
+    // if we are writing out a new snapshot make sure to persist the max
+    // column family.
+    if (column_family_set_->GetMaxColumnFamily() > 0) {
+      first_writer.edit_list.front()->SetMaxColumnFamily(
+          column_family_set_->GetMaxColumnFamily());
+    }
+    for (const auto* cfd : *column_family_set_) {
+      assert(curr_state.find(cfd->GetID()) == curr_state.end());
+      curr_state.emplace(std::make_pair(
+          cfd->GetID(),
+          MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())));
+    }
+
+    for (const auto& wal : wals_.GetWals()) {
+      wal_additions.AddWal(wal.first, wal.second);
+    }
+  }
+
+  uint64_t new_manifest_file_size = 0;
+  Status s;
+  IOStatus io_s;
+  IOStatus manifest_io_status;
+  {
+    FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+    mu->Unlock();
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
+    if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        assert(!builder_guards.empty() &&
+               builder_guards.size() == versions.size());
+        assert(!mutable_cf_options_ptrs.empty() &&
+               builder_guards.size() == versions.size());
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        s = builder_guards[i]->version_builder()->LoadTableHandlers(
+            cfd->internal_stats(), 1 /* max_threads */,
+            true /* prefetch_index_and_filter_in_cache */,
+            false /* is_initial_load */,
+            mutable_cf_options_ptrs[i]->prefix_extractor,
+            MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
+        if (!s.ok()) {
+          if (db_options_->paranoid_checks) {
+            break;
+          }
+          s = Status::OK();
+        }
+      }
+    }
+
+    if (s.ok() && new_descriptor_log) {
+      // This is fine because everything inside of this block is serialized --
+      // only one thread can be here at the same time
+      // create new manifest file
+      ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+                     pending_manifest_file_number_);
+      std::string descriptor_fname =
+          DescriptorFileName(dbname_, pending_manifest_file_number_);
+      std::unique_ptr<FSWritableFile> descriptor_file;
+      io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
+                             opt_file_opts);
+      if (io_s.ok()) {
+        descriptor_file->SetPreallocationBlockSize(
+            db_options_->manifest_preallocation_size);
+        FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
+        std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+            std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
+            io_tracer_, nullptr, db_options_->listeners, nullptr,
+            tmp_set.Contains(FileType::kDescriptorFile),
+            tmp_set.Contains(FileType::kDescriptorFile)));
+        descriptor_log_.reset(
+            new log::Writer(std::move(file_writer), 0, false));
+        s = WriteCurrentStateToManifest(curr_state, wal_additions,
+                                        descriptor_log_.get(), io_s);
+      } else {
+        manifest_io_status = io_s;
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+        constexpr bool update_stats = true;
+
+        for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+          versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
+        }
+      }
+
+      // Write new records to MANIFEST log
+#ifndef NDEBUG
+      size_t idx = 0;
+#endif
+      for (auto& e : batch_edits) {
+        std::string record;
+        if (!e->EncodeTo(&record)) {
+          s = Status::Corruption("Unable to encode VersionEdit:" +
+                                 e->DebugString(true));
+          break;
+        }
+        TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
+                                     REDUCE_ODDS2);
+#ifndef NDEBUG
+        if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+              nullptr);
+          TEST_SYNC_POINT(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+        }
+        ++idx;
+#endif /* !NDEBUG */
+        io_s = descriptor_log_->AddRecord(record);
+        if (!io_s.ok()) {
+          s = io_s;
+          manifest_io_status = io_s;
+          break;
+        }
+      }
+      if (s.ok()) {
+        io_s = SyncManifest(db_options_, descriptor_log_->file());
+        manifest_io_status = io_s;
+        TEST_SYNC_POINT_CALLBACK(
+            "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
+      }
+      if (!io_s.ok()) {
+        s = io_s;
+        ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+                        s.ToString().c_str());
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok()) {
+      assert(manifest_io_status.ok());
+    }
+    if (s.ok() && new_descriptor_log) {
+      io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
+                            dir_contains_current_file);
+      if (!io_s.ok()) {
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      // find offset in manifest file where this version is stored.
+      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+    }
+
+    if (first_writer.edit_list.front()->is_column_family_drop_) {
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+    }
+
+    LogFlush(db_options_->info_log);
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+    mu->Lock();
+  }
+
+  if (s.ok()) {
+    // Apply WAL edits, DB mutex must be held.
+    for (auto& e : batch_edits) {
+      if (e->IsWalAddition()) {
+        s = wals_.AddWals(e->GetWalAdditions());
+      } else if (e->IsWalDeletion()) {
+        s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  if (!io_s.ok()) {
+    if (io_status_.ok()) {
+      io_status_ = io_s;
+    }
+  } else if (!io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  // Append the old manifest file to the obsolete_manifest_ list to be deleted
+  // by PurgeObsoleteFiles later.
+  if (s.ok() && new_descriptor_log) {
+    obsolete_manifests_.emplace_back(
+        DescriptorFileName("", manifest_file_number_));
+  }
+
+  // Install the new versions
+  if (s.ok()) {
+    if (first_writer.edit_list.front()->is_column_family_add_) {
+      assert(batch_edits.size() == 1);
+      assert(new_cf_options != nullptr);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+    } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+      assert(batch_edits.size() == 1);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      first_writer.cfd->SetDropped();
+      first_writer.cfd->UnrefAndTryDelete();
+    } else {
+      // Each version in versions corresponds to a column family.
+      // For each column family, update its log number indicating that logs
+      // with number smaller than this should be ignored.
+      uint64_t last_min_log_number_to_keep = 0;
+      for (const auto& e : batch_edits) {
+        ColumnFamilyData* cfd = nullptr;
+        if (!e->IsColumnFamilyManipulation()) {
+          cfd = column_family_set_->GetColumnFamily(e->column_family_);
+          // e would not have been added to batch_edits if its corresponding
+          // column family is dropped.
+          assert(cfd);
+        }
+        if (cfd) {
+          if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) {
+            cfd->SetLogNumber(e->log_number_);
+          }
+          if (e->HasFullHistoryTsLow()) {
+            cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
+          }
+        }
+        if (e->has_min_log_number_to_keep_) {
+          last_min_log_number_to_keep =
+              std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+        }
+      }
+
+      if (last_min_log_number_to_keep != 0) {
+        MarkMinLogNumberToKeep(last_min_log_number_to_keep);
+      }
+
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        AppendVersion(cfd, versions[i]);
+      }
+    }
+    assert(max_last_sequence >= descriptor_last_sequence_);
+    descriptor_last_sequence_ = max_last_sequence;
+    manifest_file_number_ = pending_manifest_file_number_;
+    manifest_file_size_ = new_manifest_file_size;
+    prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+  } else {
+    std::string version_edits;
+    for (auto& e : batch_edits) {
+      version_edits += ("\n" + e->DebugString(true));
+    }
+    ROCKS_LOG_ERROR(db_options_->info_log,
+                    "Error in committing version edit to MANIFEST: %s",
+                    version_edits.c_str());
+    for (auto v : versions) {
+      delete v;
+    }
+    if (manifest_io_status.ok()) {
+      manifest_file_number_ = pending_manifest_file_number_;
+      manifest_file_size_ = new_manifest_file_size;
+    }
+    // If manifest append failed for whatever reason, the file could be
+    // corrupted. So we need to force the next version update to start a
+    // new manifest file.
+    descriptor_log_.reset();
+    // If manifest operations failed, then we know the CURRENT file still
+    // points to the original MANIFEST. Therefore, we can safely delete the
+    // new MANIFEST.
+    // If manifest operations succeeded, and we are here, then it is possible
+    // that renaming tmp file to CURRENT failed.
+    //
+    // On local POSIX-compliant FS, the CURRENT must point to the original
+    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
+    // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
+    // process not to crash and continue using the db. Any future LogAndApply()
+    // call will switch to a new MANIFEST and update CURRENT, still ignoring
+    // this one.
+    //
+    // On non-local FS, it is
+    // possible that the rename operation succeeded on the server (remote)
+    // side, but the client somehow returns a non-ok status to RocksDB. Note
+    // that this does not violate atomicity. Should we delete the new MANIFEST
+    // successfully, a subsequent recovery attempt will likely see the CURRENT
+    // pointing to the new MANIFEST, thus fail. We will not be able to open the
+    // DB again. Therefore, if manifest operations succeed, we should keep
+    // the new MANIFEST. If the process proceeds, any future LogAndApply() call
+    // will switch to a new MANIFEST and update CURRENT. If user tries to
+    // re-open the DB,
+    // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
+    // b) CURRENT points to the original MANIFEST, and the original MANIFEST
+    //    also exists.
+    if (new_descriptor_log && !manifest_io_status.ok()) {
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+                     "\n",
+                     pending_manifest_file_number_, manifest_file_number_);
+      Status manifest_del_status = env_->DeleteFile(
+          DescriptorFileName(dbname_, pending_manifest_file_number_));
+      if (!manifest_del_status.ok()) {
+        ROCKS_LOG_WARN(db_options_->info_log,
+                       "Failed to delete manifest %" PRIu64 ": %s",
+                       pending_manifest_file_number_,
+                       manifest_del_status.ToString().c_str());
+      }
+    }
+  }
+
+  pending_manifest_file_number_ = 0;
+
+#ifndef NDEBUG
+  // This is here kind of awkwardly because there's no other consistency
+  // checks on `VersionSet`'s updates for the new `Version`s. We might want
+  // to move it to a dedicated function, or remove it if we gain enough
+  // confidence in `descriptor_last_sequence_`.
+  if (s.ok()) {
+    for (const auto* v : versions) {
+      const auto* vstorage = v->storage_info();
+      for (int level = 0; level < vstorage->num_levels(); ++level) {
+        for (const auto& file : vstorage->LevelFiles(level)) {
+          assert(file->fd.largest_seqno <= descriptor_last_sequence_);
+        }
+      }
+    }
+  }
+#endif  // NDEBUG
+
+  // wake up all the waiting writers
+  while (true) {
+    ManifestWriter* ready = manifest_writers_.front();
+    manifest_writers_.pop_front();
+    bool need_signal = true;
+    for (const auto& w : writers) {
+      if (&w == ready) {
+        need_signal = false;
+        break;
+      }
+    }
+    ready->status = s;
+    ready->done = true;
+    if (ready->manifest_write_callback) {
+      (ready->manifest_write_callback)(s);
+    }
+    if (need_signal) {
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) {
+      break;
+    }
+  }
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+  return s;
+}
+
+void VersionSet::WakeUpWaitingManifestWriters() {
+  // Notify the new head of the manifest write queue.
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+}
+
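+// Minimal usage sketch for LogAndApply() (illustrative; uses the simpler
+// single-column-family convenience overload from version_set.h, with the
+// caller holding the DB mutex; `new_log_number` is hypothetical):
+//   VersionEdit edit;
+//   edit.SetLogNumber(new_log_number);
+//   Status s = versions->LogAndApply(cfd, mutable_cf_options, &edit, &mu,
+//                                    db_directory);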
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+    bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
+    const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
+  mu->AssertHeld();
+  int num_edits = 0;
+  for (const auto& elist : edit_lists) {
+    num_edits += static_cast<int>(elist.size());
+  }
+  if (num_edits == 0) {
+    return Status::OK();
+  } else if (num_edits > 1) {
+#ifndef NDEBUG
+    for (const auto& edit_list : edit_lists) {
+      for (const auto& edit : edit_list) {
+        assert(!edit->IsColumnFamilyManipulation());
+      }
+    }
+#endif /* ! NDEBUG */
+  }
+
+  int num_cfds = static_cast<int>(column_family_datas.size());
+  if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+    assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+    assert(edit_lists[0][0]->is_column_family_add_);
+    assert(new_cf_options != nullptr);
+  }
+  std::deque<ManifestWriter> writers;
+  if (num_cfds > 0) {
+    assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+    assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+  }
+  for (int i = 0; i < num_cfds; ++i) {
+    const auto wcb =
+        manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
+    writers.emplace_back(mu, column_family_datas[i],
+                         *mutable_cf_options_list[i], edit_lists[i], wcb);
+    manifest_writers_.push_back(&writers[i]);
+  }
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
+                           nullptr);
+  while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+    first_writer.cv.Wait();
+  }
+  if (first_writer.done) {
+    // All non-CF-manipulation operations can be grouped together and committed
+    // to MANIFEST. They should all have finished. The status code is stored in
+    // the first manifest writer.
+#ifndef NDEBUG
+    for (const auto& writer : writers) {
+      assert(writer.done);
+    }
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
+#endif /* !NDEBUG */
+    return first_writer.status;
+  }
+
+  int num_undropped_cfds = 0;
+  for (auto cfd : column_family_datas) {
+    // if cfd == nullptr, it is a column family add.
+    if (cfd == nullptr || !cfd->IsDropped()) {
+      ++num_undropped_cfds;
+    }
+  }
+  if (0 == num_undropped_cfds) {
+    for (int i = 0; i != num_cfds; ++i) {
+      manifest_writers_.pop_front();
+    }
+    // Notify new head of manifest write queue.
+    if (!manifest_writers_.empty()) {
+      manifest_writers_.front()->cv.Signal();
+    }
+    return Status::ColumnFamilyDropped();
+  }
+  return ProcessManifestWrites(writers, mu, dir_contains_current_file,
+                               new_descriptor_log, new_cf_options);
+}
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
+                                     SequenceNumber* max_last_sequence) {
+  assert(max_last_sequence != nullptr);
+  assert(edit->IsColumnFamilyManipulation());
+  edit->SetNextFile(next_file_number_.load());
+  assert(!edit->HasLastSequence());
+  edit->SetLastSequence(*max_last_sequence);
+  if (edit->is_column_family_drop_) {
+    // if we drop a column family, we have to make sure to save the max
+    // column family, so that we don't reuse an existing ID
+    edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+  }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+                                     VersionBuilder* builder, VersionEdit* edit,
+                                     SequenceNumber* max_last_sequence,
+                                     InstrumentedMutex* mu) {
+#ifdef NDEBUG
+  (void)cfd;
+#endif
+  mu->AssertHeld();
+  assert(!edit->IsColumnFamilyManipulation());
+  assert(max_last_sequence != nullptr);
+
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= cfd->GetLogNumber());
+    assert(edit->log_number_ < next_file_number_.load());
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+  edit->SetNextFile(next_file_number_.load());
+  if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
+    *max_last_sequence = edit->GetLastSequence();
+  } else {
+    edit->SetLastSequence(*max_last_sequence);
+  }
+
+  // The builder can be nullptr only if edit is WAL manipulation;
+  // because WAL edits do not need to be applied to versions,
+  // we return Status::OK() in this case.
+  assert(builder || edit->IsWalManipulation());
+  return builder ? builder->Apply(edit) : Status::OK();
+}
+
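+// Example for GetCurrentManifestPath() below (illustrative): a CURRENT file
+// containing "MANIFEST-000123\n" yields *manifest_path ==
+// "<dbname>/MANIFEST-000123" and *manifest_file_number == 123; a missing
+// trailing newline or an unparsable name is reported as corruption.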
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+                                          FileSystem* fs,
+                                          std::string* manifest_path,
+                                          uint64_t* manifest_file_number) {
+  assert(fs != nullptr);
+  assert(manifest_path != nullptr);
+  assert(manifest_file_number != nullptr);
+
+  std::string fname;
+  Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+  if (!s.ok()) {
+    return s;
+  }
+  if (fname.empty() || fname.back() != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  // remove the trailing '\n'
+  fname.resize(fname.size() - 1);
+  FileType type;
+  bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+  if (!parse_ok || type != kDescriptorFile) {
+    return Status::Corruption("CURRENT file corrupted");
+  }
+  *manifest_path = dbname;
+  if (dbname.back() != '/') {
+    manifest_path->push_back('/');
+  }
+  manifest_path->append(fname);
+  return Status::OK();
+}
+
+Status VersionSet::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    std::string* db_id, bool no_error_if_files_missing) {
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string manifest_path;
+  Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+                                    &manifest_file_number_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+                 manifest_path.c_str());
+
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  {
+    std::unique_ptr<FSSequentialFile> manifest_file;
+    s = fs_->NewSequentialFile(manifest_path,
+                               fs_->OptimizeForManifestRead(file_options_),
+                               &manifest_file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    manifest_file_reader.reset(new SequentialFileReader(
+        std::move(manifest_file), manifest_path,
+        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+  }
+  uint64_t current_manifest_file_size = 0;
+  uint64_t log_number = 0;
+  {
+    VersionSet::LogReporter reporter;
+    Status log_read_status;
+    reporter.status = &log_read_status;
+    log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+                       true /* checksum */, 0 /* log_number */);
+    VersionEditHandler handler(
+        read_only, column_families, const_cast<VersionSet*>(this),
+        /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
+    handler.Iterate(reader, &log_read_status);
+    s = handler.status();
+    if (s.ok()) {
+      log_number = handler.GetVersionEditParams().log_number_;
+      current_manifest_file_size = reader.GetReadOffset();
+      assert(current_manifest_file_size != 0);
+      handler.GetDbId(db_id);
+    }
+  }
+
+  if (s.ok()) {
+    manifest_file_size_ = current_manifest_file_size;
+    ROCKS_LOG_INFO(
+        db_options_->info_log,
+        "Recovered from manifest file:%s succeeded,"
+        "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+        ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+        ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+        ",min_log_number_to_keep is %" PRIu64 "\n",
+        manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+        last_sequence_.load(), log_number, prev_log_number_,
+        column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
+
+    for (auto cfd : *column_family_set_) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Column family [%s] (ID %" PRIu32
+                     "), log number is %" PRIu64 "\n",
+                     cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+    }
+  }
+
+  return s;
+}
+
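+// Usage sketch for Recover() above (illustrative; mirrors the call in
+// ReduceNumberOfLevels() below):
+//   std::vector<ColumnFamilyDescriptor> cfs;
+//   cfs.emplace_back(kDefaultColumnFamilyName, cf_options);
+//   Status s = versions.Recover(cfs);  // reads CURRENT, replays the MANIFEST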
+namespace {
+class ManifestPicker {
+ public:
+  explicit ManifestPicker(const std::string& dbname,
+                          const std::vector<std::string>& files_in_dbname);
+  // REQUIRES Valid() == true
+  std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
+  bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
+
+ private:
+  const std::string& dbname_;
+  // MANIFEST file name(s)
+  std::vector<std::string> manifest_files_;
+  std::vector<std::string>::const_iterator manifest_file_iter_;
+};
+
+ManifestPicker::ManifestPicker(const std::string& dbname,
+                               const std::vector<std::string>& files_in_dbname)
+    : dbname_(dbname) {
+  // populate manifest files
+  assert(!files_in_dbname.empty());
+  for (const auto& fname : files_in_dbname) {
+    uint64_t file_num = 0;
+    FileType file_type;
+    bool parse_ok = ParseFileName(fname, &file_num, &file_type);
+    if (parse_ok && file_type == kDescriptorFile) {
+      manifest_files_.push_back(fname);
+    }
+  }
+  // seek to first manifest
+  std::sort(manifest_files_.begin(), manifest_files_.end(),
+            [](const std::string& lhs, const std::string& rhs) {
+              uint64_t num1 = 0;
+              uint64_t num2 = 0;
+              FileType type1;
+              FileType type2;
+              bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
+              bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
+#ifndef NDEBUG
+              assert(parse_ok1);
+              assert(parse_ok2);
+#else
+              (void)parse_ok1;
+              (void)parse_ok2;
+#endif
+              return num1 > num2;
+            });
+  manifest_file_iter_ = manifest_files_.begin();
+}
+
+std::string ManifestPicker::GetNextManifest(uint64_t* number,
+                                            std::string* file_name) {
+  assert(Valid());
+  std::string ret;
+  if (manifest_file_iter_ != manifest_files_.end()) {
+    ret.assign(dbname_);
+    if (ret.back() != kFilePathSeparator) {
+      ret.push_back(kFilePathSeparator);
+    }
+    ret.append(*manifest_file_iter_);
+    if (number) {
+      FileType type;
+      bool parse = ParseFileName(*manifest_file_iter_, number, &type);
+      assert(type == kDescriptorFile);
+#ifndef NDEBUG
+      assert(parse);
+#else
+      (void)parse;
+#endif
+    }
+    if (file_name) {
+      *file_name = *manifest_file_iter_;
+    }
+    ++manifest_file_iter_;
+  }
+  return ret;
+}
+}  // anonymous namespace
+
+Status VersionSet::TryRecover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    const std::vector<std::string>& files_in_dbname, std::string* db_id,
+    bool* has_missing_table_file) {
+  ManifestPicker manifest_picker(dbname_, files_in_dbname);
+  if (!manifest_picker.Valid()) {
+    return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
+  }
+  Status s;
+  std::string manifest_path =
+      manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  while (!manifest_path.empty()) {
+    s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
+                                  db_id, has_missing_table_file);
+    if (s.ok() || !manifest_picker.Valid()) {
+      break;
+    }
+    Reset();
+    manifest_path =
+        manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  }
+  return s;
+}
+
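+// Example for the picker/retry loop above (hypothetical directory): with
+// MANIFEST-000005 and MANIFEST-000012 present, ManifestPicker yields
+// MANIFEST-000012 first (descending file number), so TryRecover() prefers
+// the newest manifest and falls back to older ones only if point-in-time
+// recovery from it fails.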
+  reporter.status = &s;
+  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+                     /*checksum=*/true, /*log_num=*/0);
+  VersionEditHandlerPointInTime handler_pit(
+      read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
+
+  handler_pit.Iterate(reader, &s);
+
+  handler_pit.GetDbId(db_id);
+
+  assert(nullptr != has_missing_table_file);
+  *has_missing_table_file = handler_pit.HasMissingFiles();
+
+  return handler_pit.status();
+}
+
+Status VersionSet::ListColumnFamilies(
+    std::vector<std::string>* column_families, const std::string& dbname,
+    FileSystem* fs) {
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string manifest_path;
+  uint64_t manifest_file_number;
+  Status s =
+      GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+  if (!s.ok()) {
+    return s;
+  }
+  return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
+}
+
+Status VersionSet::ListColumnFamiliesFromManifest(
+    const std::string& manifest_path, FileSystem* fs,
+    std::vector<std::string>* column_families) {
+  std::unique_ptr<SequentialFileReader> file_reader;
+  Status s;
+  {
+    std::unique_ptr<FSSequentialFile> file;
+    // these are just for performance reasons, not correctness,
+    // so we're fine using the defaults
+    s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    file_reader = std::make_unique<SequentialFileReader>(
+        std::move(file), manifest_path, /*io_tracer=*/nullptr);
+  }
+
+  VersionSet::LogReporter reporter;
+  reporter.status = &s;
+  log::Reader reader(nullptr, std::move(file_reader), &reporter,
+                     true /* checksum */, 0 /* log_number */);
+
+  ListColumnFamiliesHandler handler;
+  handler.Iterate(reader, &s);
+
+  assert(column_families);
+  column_families->clear();
+  if (handler.status().ok()) {
+    for (const auto& iter : handler.GetColumnFamilyNames()) {
+      column_families->push_back(iter.second);
+    }
+  }
+
+  return handler.status();
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+                                        const Options* options,
+                                        const FileOptions& file_options,
+                                        int new_levels) {
+  if (new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  ImmutableDBOptions db_options(*options);
+  ColumnFamilyOptions cf_options(*options);
+  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+                                        options->table_cache_numshardbits));
+  WriteController wc(options->delayed_write_rate);
+  WriteBufferManager wb(options->db_write_buffer_size);
+  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+                      nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+                      /*db_id*/ "",
+                      /*db_session_id*/ "");
+  Status status;
+
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(*options));
+  dummy.push_back(dummy_descriptor);
+  status = versions.Recover(dummy);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Version* current_version =
+      versions.GetColumnFamilySet()->GetDefault()->current();
+  auto* vstorage = current_version->storage_info();
+  int current_levels = vstorage->num_levels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are files on only one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = vstorage->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        snprintf(msg, sizeof(msg),
+                 "Found at least two levels containing files: "
+                 "[%d:%d],[%d:%d].\n",
+                 first_nonempty_level, first_nonempty_level_filenum, i,
+                 file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  // we need to allocate an array with the old number of levels size to
+  // avoid SIGSEGV in WriteCurrentStateToManifest()
+  // however, all levels bigger or equal to new_levels will be empty
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[current_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = vstorage->LevelFiles(i);
+  }
+
+  if (first_nonempty_level > 0) {
+    auto& new_last_level = new_files_list[new_levels - 1];
+
+    new_last_level = vstorage->LevelFiles(first_nonempty_level);
+
+    for (size_t i = 0; i < new_last_level.size(); ++i) {
+      const FileMetaData* const meta = new_last_level[i];
+      assert(meta);
+
+      const uint64_t file_number = meta->fd.GetNumber();
+
+      vstorage->file_locations_[file_number] =
+          VersionStorageInfo::FileLocation(new_levels - 1, i);
+    }
+  }
+
+  delete[] vstorage->files_;
+  vstorage->files_ = new_files_list;
+  vstorage->num_levels_ = new_levels;
+  vstorage->ResizeCompactCursors(new_levels);
+
+  MutableCFOptions mutable_cf_options(*options);
+  VersionEdit ve;
+  InstrumentedMutex dummy_mutex;
+  InstrumentedMutexLock l(&dummy_mutex);
+  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
+                              mutable_cf_options, &ve, &dummy_mutex, nullptr,
+                              true);
+}
+
+// Get the checksum information, including the checksum and checksum function
+// name, of all SST and blob files in the VersionSet. Store the information in
+// a FileChecksumList, which contains a map from file number to its checksum
+// info. If the DB is not running, make sure to call VersionSet::Recover() to
+// load the file metadata from the manifest into the VersionSet before calling
+// this function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+  // Clean the previously stored checksum information if any.
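+  // Usage sketch (assuming a FileChecksumList obtained from the public
+  // NewFileChecksumList() factory): after VersionSet::Recover(), calling
+  // this fills the list with one entry per live SST/blob file; files
+  // written without a checksum generator are reported as
+  // kUnknownFileChecksum / kUnknownFileChecksumFuncName (see below).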
+ Status s; + if (checksum_list == nullptr) { + s = Status::InvalidArgument("checksum_list is nullptr"); + return s; + } + checksum_list->reset(); + + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + + const auto* current = cfd->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + /* SST files */ + for (int level = 0; level < cfd->NumberLevels(); level++) { + const auto& level_files = vstorage->LevelFiles(level); + + for (const auto& file : level_files) { + assert(file); + + s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), + file->file_checksum, + file->file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + } + + /* Blob files */ + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(), + checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + } + + return s; +} + +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex, bool json) { + assert(options.env); + std::vector column_families; + Status s = ListColumnFamiliesFromManifest( + dscname, options.env->GetFileSystem().get(), &column_families); + if (!s.ok()) { + return s; + } + + // Open the specified manifest file. + std::unique_ptr file_reader; + { + std::unique_ptr file; + const std::shared_ptr& fs = options.env->GetFileSystem(); + s = fs->NewSequentialFile( + dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr); + if (!s.ok()) { + return s; + } + file_reader = std::make_unique( + std::move(file), dscname, db_options_->log_readahead_size, io_tracer_); + } + + std::vector cf_descs; + for (const auto& cf : column_families) { + cf_descs.emplace_back(cf, options); + } + + DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json); + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + handler.Iterate(reader, &s); + } + + return handler.status(); +} +#endif // ROCKSDB_LITE + +void VersionSet::MarkFileNumberUsed(uint64_t number) { + // only called during recovery and repair which are single threaded, so this + // works because there can't be concurrent calls + if (next_file_number_.load(std::memory_order_relaxed) <= number) { + next_file_number_.store(number + 1, std::memory_order_relaxed); + } +} +// Called only either from ::LogAndApply which is protected by mutex or during +// recovery which is single-threaded. +void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { + if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_.store(number, std::memory_order_relaxed); + } +} + +Status VersionSet::WriteCurrentStateToManifest( + const std::unordered_map& curr_state, + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { + // TODO: Break up into multiple records to reduce memory usage on recovery? + + // WARNING: This method doesn't hold a mutex!! 
+ + // This is done without DB mutex lock held, but only within single-threaded + // LogAndApply. Column family manipulations can only happen within LogAndApply + // (the same single thread), so we're safe to iterate. + + assert(io_s.ok()); + if (db_options_->write_dbid_to_manifest) { + VersionEdit edit_for_db_id; + assert(!db_id_.empty()); + edit_for_db_id.SetDBId(db_id_); + std::string db_id_record; + if (!edit_for_db_id.EncodeTo(&db_id_record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit_for_db_id.DebugString(true)); + } + io_s = log->AddRecord(db_id_record); + if (!io_s.ok()) { + return io_s; + } + } + + // Save WALs. + if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + { + // Store column family info + VersionEdit edit; + if (cfd->GetID() != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd->GetName()); + edit.SetColumnFamily(cfd->GetID()); + } + edit.SetComparatorName( + cfd->internal_comparator().user_comparator()->Name()); + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + const auto* current = cfd->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + for (int level = 0; level < cfd->NumberLevels(); level++) { + const auto& level_files = vstorage->LevelFiles(level); + + for (const auto& f : level_files) { + assert(f); + + edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->unique_id); + } + } + + edit.SetCompactCursors(vstorage->GetCompactCursors()); + + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + const uint64_t blob_file_number = meta->GetBlobFileNumber(); + + edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(), + meta->GetTotalBlobBytes(), meta->GetChecksumMethod(), + meta->GetChecksumValue()); + if (meta->GetGarbageBlobCount() > 0) { + edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(), + meta->GetGarbageBlobBytes()); + } + } + + const auto iter = curr_state.find(cfd->GetID()); + assert(iter != curr_state.end()); + uint64_t log_number = iter->second.log_number; + edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column + // family. So it does not need to be set for every column family, just + // need to be set once. Since default CF can never be dropped, we set + // the min_log to the default CF here. 
+ uint64_t min_log = min_log_number_to_keep(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } + + edit.SetLastSequence(descriptor_last_sequence_); + + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + } + return Status::OK(); +} + +// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this +// function is called repeatedly with consecutive pairs of slices. For example +// if the slice list is [a, b, c, d] this function is called with arguments +// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where +// we avoid doing binary search for the keys b and c twice and instead somehow +// maintain state of where they first appear in the files. +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, + const Slice& end, int start_level, + int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + + // pre-condition + assert(icmp.Compare(start, end) <= 0); + + uint64_t total_full_size = 0; + const auto* vstorage = v->storage_info(); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); + + assert(start_level <= end_level); + + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
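+  // Worked example with illustrative numbers: if files_size_error_margin is
+  // 0.1, total_full_size is 100MB and total_intersecting_size is 8MB, then
+  // 8MB < 100MB * 0.1 holds, so we simply add 8MB / 2 = 4MB for the
+  // boundary files instead of binary-searching inside them; the answer is
+  // off by at most ~4MB (< 10% of the full-file total).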
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { + const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); + if (files_brief.num_files == 0) { + // empty level, skip exploration + continue; + } + + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } + continue; + } + + assert(level > 0); + assert(files_brief.num_files > 0); + + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); + + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); + + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], start, end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); + } + } + + // The sum of all file sizes that intersect the [start, end] keys range. + uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files (might also be last files), at each + // level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. 
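+      // (A last file's smallest key is known to be past `start` here, so
+      // ApproximateSize(v, f, start, end, ...) would reduce to
+      // ApproximateOffsetOf(v, f, end, ...) anyway.)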
+ total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + } + } + + return total_full_size; +} + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + + uint64_t result = 0; + if (icmp.Compare(f.largest_key, key) <= 0) { + // Entire file is before "key", so just add the file size + result = f.fd.GetFileSize(); + } else if (icmp.Compare(f.smallest_key, key) > 0) { + // Entire file is after "key", so ignore + result = 0; + } else { + // "key" falls in the range for this table. Add the + // approximate offset of "key" within the table. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, *f.file_metadata, caller, icmp, + v->GetMutableCFOptions().prefix_extractor); + } + } + return result; +} + +uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + return table_cache->ApproximateSize( + start, end, *f.file_metadata, caller, icmp, + v->GetMutableCFOptions().prefix_extractor); +} + +void VersionSet::RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const { + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { + continue; + } + + auto* current = cfd->current(); + bool found_current = false; + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + if (v == current) { + found_current = true; + } + } + + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. 
+ assert(false); + current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + } + } +} + +void VersionSet::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + + // pre-calculate space requirement + size_t total_table_files = 0; + size_t total_blob_files = 0; + + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (!cfd->initialized()) { + continue; + } + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + assert(v); + + const auto* vstorage = v->storage_info(); + assert(vstorage); + + for (int level = 0; level < vstorage->num_levels(); ++level) { + total_table_files += vstorage->LevelFiles(level).size(); + } + + total_blob_files += vstorage->GetBlobFiles().size(); + } + } + + // just one time extension to the right size + live_table_files->reserve(live_table_files->size() + total_table_files); + live_blob_files->reserve(live_blob_files->size() + total_blob_files); + + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { + continue; + } + + auto* current = cfd->current(); + bool found_current = false; + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->AddLiveFiles(live_table_files, live_blob_files); + if (v == current) { + found_current = true; + } + } + + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. + assert(false); + current->AddLiveFiles(live_table_files, live_blob_files); + } + } +} + +InternalIterator* VersionSet::MakeInputIterator( + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, + const FileOptions& file_options_compactions, + const std::optional& start, + const std::optional& end) { + auto cfd = c->column_family_data(); + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files + + c->num_input_levels() - 1 + : c->num_input_levels()); + InternalIterator** list = new InternalIterator*[space]; + size_t num = 0; + for (size_t which = 0; which < c->num_input_levels(); which++) { + if (c->input_levels(which)->num_files != 0) { + if (c->level(which) == 0) { + const LevelFilesBrief* flevel = c->input_levels(which); + for (size_t i = 0; i < flevel->num_files; i++) { + const FileMetaData& fmd = *flevel->files[i].file_metadata; + if (start.has_value() && + cfd->user_comparator()->CompareWithoutTimestamp( + start.value(), fmd.largest.user_key()) > 0) { + continue; + } + // We should be able to filter out the case where the end key + // equals to the end boundary, since the end key is exclusive. + // We try to be extra safe here. 
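+          // (Concretely: a file whose smallest user key equals *end is still
+          // included, even though the end boundary itself is exclusive.)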
+ if (end.has_value() && + cfd->user_comparator()->CompareWithoutTimestamp( + end.value(), fmd.smallest.user_key()) < 0) { + continue; + } + + list[num++] = cfd->table_cache()->NewIterator( + read_options, file_options_compactions, + cfd->internal_comparator(), fmd, range_del_agg, + c->mutable_cf_options()->prefix_extractor, + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), + MaxFileSizeForL0MetaPin(*c->mutable_cf_options()), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = new LevelIterator( + cfd->table_cache(), read_options, file_options_compactions, + cfd->internal_comparator(), c->input_levels(which), + c->mutable_cf_options()->prefix_extractor, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), range_del_agg, + c->boundaries(which)); + } + } + } + assert(num <= space); + InternalIterator* result = + NewMergingIterator(&c->column_family_data()->internal_comparator(), list, + static_cast(num)); + delete[] list; + return result; +} + +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta, + ColumnFamilyData** cfd) { + for (auto cfd_iter : *column_family_set_) { + if (!cfd_iter->initialized()) { + continue; + } + Version* version = cfd_iter->current(); + const auto* vstorage = version->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file : vstorage->LevelFiles(level)) { + if (file->fd.GetNumber() == number) { + *meta = file; + *filelevel = level; + *cfd = cfd_iter; + return Status::OK(); + } + } + } + } + return Status::NotFound("File not present in any level"); +} + +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { + LiveFileMetaData filemetadata; + filemetadata.column_family_name = cfd->GetName(); + uint32_t path_id = file->fd.GetPathId(); + if (path_id < cfd->ioptions()->cf_paths.size()) { + filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path; + } else { + assert(!cfd->ioptions()->cf_paths.empty()); + filemetadata.db_path = cfd->ioptions()->cf_paths.back().path; + } + filemetadata.directory = filemetadata.db_path; + const uint64_t file_number = file->fd.GetNumber(); + filemetadata.name = MakeTableFileName("", file_number); + filemetadata.relative_filename = filemetadata.name.substr(1); + filemetadata.file_number = file_number; + filemetadata.level = level; + filemetadata.size = file->fd.GetFileSize(); + filemetadata.smallestkey = file->smallest.user_key().ToString(); + filemetadata.largestkey = file->largest.user_key().ToString(); + filemetadata.smallest_seqno = file->fd.smallest_seqno; + filemetadata.largest_seqno = file->fd.largest_seqno; + filemetadata.num_reads_sampled = + file->stats.num_reads_sampled.load(std::memory_order_relaxed); + filemetadata.being_compacted = file->being_compacted; + filemetadata.num_entries = file->num_entries; + filemetadata.num_deletions = file->num_deletions; + 
filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; + filemetadata.file_checksum = file->file_checksum; + filemetadata.file_checksum_func_name = file->file_checksum_func_name; + filemetadata.temperature = file->temperature; + filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); + filemetadata.file_creation_time = file->TryGetFileCreationTime(); + metadata->push_back(filemetadata); + } + } + } +} + +void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* blob_files, + std::vector* manifest_filenames, + uint64_t min_pending_output) { + assert(files); + assert(blob_files); + assert(manifest_filenames); + assert(files->empty()); + assert(blob_files->empty()); + assert(manifest_filenames->empty()); + + std::vector pending_files; + for (auto& f : obsolete_files_) { + if (f.metadata->fd.GetNumber() < min_pending_output) { + files->emplace_back(std::move(f)); + } else { + pending_files.emplace_back(std::move(f)); + } + } + obsolete_files_.swap(pending_files); + + std::vector pending_blob_files; + for (auto& blob_file : obsolete_blob_files_) { + if (blob_file.GetBlobFileNumber() < min_pending_output) { + blob_files->emplace_back(std::move(blob_file)); + } else { + pending_blob_files.emplace_back(std::move(blob_file)); + } + } + obsolete_blob_files_.swap(pending_blob_files); + + obsolete_manifests_.swap(*manifest_filenames); +} + +ColumnFamilyData* VersionSet::CreateColumnFamily( + const ColumnFamilyOptions& cf_options, const VersionEdit* edit) { + assert(edit->is_column_family_add_); + + MutableCFOptions dummy_cf_options; + Version* dummy_versions = + new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_); + // Ref() dummy version once so that later we can call Unref() to delete it + // by avoiding calling "delete" explicitly (~Version is private) + dummy_versions->Ref(); + auto new_cfd = column_family_set_->CreateColumnFamily( + edit->column_family_name_, edit->column_family_, dummy_versions, + cf_options); + + Version* v = new Version(new_cfd, this, file_options_, + *new_cfd->GetLatestMutableCFOptions(), io_tracer_, + current_version_number_++); + + constexpr bool update_stats = false; + + v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats); + + AppendVersion(new_cfd, v); + // GetLatestMutableCFOptions() is safe here without mutex since the + // cfd is not available to client + new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(), + LastSequence()); + new_cfd->SetLogNumber(edit->log_number_); + return new_cfd; +} + +uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) { + uint64_t count = 0; + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + count++; + } + return count; +} + +uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) { + std::unordered_set unique_files; + uint64_t total_files_size = 0; + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + VersionStorageInfo* storage_info = v->storage_info(); + for (int level = 0; level < storage_info->num_levels_; level++) { + for (const auto& file_meta : storage_info->LevelFiles(level)) { + if (unique_files.find(file_meta->fd.packed_number_and_path_id) == + unique_files.end()) { + unique_files.insert(file_meta->fd.packed_number_and_path_id); + total_files_size += file_meta->fd.GetFileSize(); + } + } + } + } + return total_files_size; +} + +uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { + std::unordered_set unique_blob_files; + + uint64_t 
all_versions_blob_file_size = 0; + + for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + // iterate all the versions + const auto* vstorage = v->storage_info(); + assert(vstorage); + + const auto& blob_files = vstorage->GetBlobFiles(); + + for (const auto& meta : blob_files) { + assert(meta); + + const uint64_t blob_file_number = meta->GetBlobFileNumber(); + + if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) { + // find Blob file that has not been counted + unique_blob_files.insert(blob_file_number); + all_versions_blob_file_size += meta->GetBlobFileSize(); + } + } + } + + return all_versions_blob_file_size; +} + +Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, + const std::string& fpath, int level, + const FileMetaData& meta) { + uint64_t fsize = 0; + Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); + if (status.ok()) { + if (fsize != meta.fd.GetFileSize()) { + status = Status::Corruption("File size mismatch: " + fpath); + } + } + if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) { + assert(cfd); + TableCache* table_cache = cfd->table_cache(); + assert(table_cache); + + const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions(); + assert(cf_opts); + std::shared_ptr pe = cf_opts->prefix_extractor; + size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts); + + const FileOptions& file_opts = file_options(); + + Version* version = cfd->current(); + assert(version); + VersionStorageInfo& storage_info = version->storage_info_; + const InternalKeyComparator* icmp = storage_info.InternalComparator(); + assert(icmp); + + InternalStats* internal_stats = cfd->internal_stats(); + + FileMetaData meta_copy = meta; + status = table_cache->FindTable( + ReadOptions(), file_opts, *icmp, meta_copy, + &(meta_copy.table_reader_handle), pe, + /*no_io=*/false, /*record_read_stats=*/true, + internal_stats->GetFileReadHist(level), false, level, + /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, + meta_copy.temperature); + if (meta_copy.table_reader_handle) { + table_cache->ReleaseHandle(meta_copy.table_reader_handle); + } + } + return status; +} + +ReactiveVersionSet::ReactiveVersionSet( + const std::string& dbname, const ImmutableDBOptions* _db_options, + const FileOptions& _file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, WriteController* write_controller, + const std::shared_ptr& io_tracer) + : VersionSet(dbname, _db_options, _file_options, table_cache, + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "", + /*db_session_id*/ "") {} + +ReactiveVersionSet::~ReactiveVersionSet() {} + +Status ReactiveVersionSet::Recover( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status) { + assert(manifest_reader != nullptr); + assert(manifest_reporter != nullptr); + assert(manifest_reader_status != nullptr); + + manifest_reader_status->reset(new Status()); + manifest_reporter->reset(new LogReporter()); + static_cast_with_check(manifest_reporter->get())->status = + manifest_reader_status->get(); + Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); + if (!s.ok()) { + return s; + } + log::Reader* reader = manifest_reader->get(); + assert(reader); + + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_)); + + manifest_tailer_->Iterate(*reader, 
manifest_reader_status->get()); + + return manifest_tailer_->status(); +} + +Status ReactiveVersionSet::ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + Status* manifest_read_status, + std::unordered_set* cfds_changed) { + assert(manifest_reader != nullptr); + assert(cfds_changed != nullptr); + mu->AssertHeld(); + + Status s; + log::Reader* reader = manifest_reader->get(); + assert(reader); + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + if (!s.ok()) { + return s; + } + manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); + s = manifest_tailer_->status(); + if (s.ok()) { + *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); + } + + return s; +} + +Status ReactiveVersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + std::string manifest_path; + s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, + &manifest_file_number_); + if (!s.ok()) { + return s; + } + std::unique_ptr manifest_file; + if (manifest_reader->get() != nullptr && + manifest_reader->get()->file()->file_name() == manifest_path) { + // CURRENT points to the same MANIFEST as before, no need to switch + // MANIFEST. + return s; + } + assert(nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path); + s = fs_->FileExists(manifest_path, IOOptions(), nullptr); + if (s.IsNotFound()) { + return Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); + } else if (!s.ok()) { + return s; + } + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:1"); + // The primary can also delete the MANIFEST while the secondary is reading + // it. This is OK on POSIX. For other file systems, maybe create a hard link + // to MANIFEST. The hard link should be cleaned up later by the secondary. + s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, + 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + if (manifest_tailer_) { + manifest_tailer_->PrepareToReadNewManifest(); + } + } else if (s.IsPathNotFound()) { + // This can happen if the primary switches to a new MANIFEST after the + // secondary reads the CURRENT file but before the secondary actually tries + // to open the MANIFEST. 
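+    // Illustrative timeline of this race: the secondary reads CURRENT ->
+    // the primary installs a new MANIFEST, updates CURRENT and deletes the
+    // old MANIFEST -> the secondary's NewSequentialFile() on the old path
+    // fails with PathNotFound. Returning TryAgain lets the caller re-read
+    // CURRENT and pick up the new MANIFEST on the next attempt.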
+ s = Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); + } + return s; +} + +#ifndef NDEBUG +uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); +} +#endif // !NDEBUG + +std::vector& ReactiveVersionSet::replay_buffer() { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().replay_buffer(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h new file mode 100644 index 000000000..03176a8b5 --- /dev/null +++ b/src/rocksdb/db/version_set.h @@ -0,0 +1,1652 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of table files per level, as well as a +// set of blob files. The entire set of versions is maintained in a +// VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache/cache_helpers.h" +#include "db/blob/blob_file_meta.h" +#include "db/column_family.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker.h" +#include "db/dbformat.h" +#include "db/file_indexer.h" +#include "db/log_reader.h" +#include "db/range_del_aggregator.h" +#include "db/read_callback.h" +#include "db/table_cache.h" +#include "db/version_builder.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "env/file_system_tracer.h" +#if USE_COROUTINES +#include "folly/experimental/coro/BlockingWait.h" +#include "folly/experimental/coro/Collect.h" +#endif +#include "monitoring/instrumented_mutex.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "table/get_context.h" +#include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" +#include "util/autovector.h" +#include "util/coro_utils.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { + +namespace log { +class Writer; +} + +class BlobIndex; +class Compaction; +class LogBuffer; +class LookupKey; +class MemTable; +class Version; +class VersionSet; +class WriteBufferManager; +class MergeContext; +class ColumnFamilySet; +class MergeIteratorBuilder; +class SystemClock; +class ManifestTailer; +class FilePickerMultiGet; + +// VersionEdit is always supposed to be valid and it is used to point at +// entries in Manifest. Ideally it should not be used as a container to +// carry around few of its fields as function params because it can cause +// readers to think it's a valid entry from Manifest. 
To avoid that confusion +// introducing VersionEditParams to simply carry around multiple VersionEdit +// params. It need not point to a valid record in Manifest. +using VersionEditParams = VersionEdit; + +// Return the smallest index i such that file_level.files[i]->largest >= key. +// Return file_level.num_files if there is no such file. +// REQUIRES: "file_level.files" contains a sorted list of +// non-overlapping files. +extern int FindFile(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const Slice& key); + +// Returns true iff some file in "files" overlaps the user key range +// [*smallest,*largest]. +// smallest==nullptr represents a key smaller than all keys in the DB. +// largest==nullptr represents a key largest than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, file_level.files[] +// contains disjoint ranges in sorted order. +extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const LevelFilesBrief& file_level, + const Slice* smallest_user_key, + const Slice* largest_user_key); + +// Generate LevelFilesBrief from vector +// Would copy smallest_key and largest_key data to sequential memory +// arena: Arena used to allocate the memory +extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, + const std::vector& files, + Arena* arena); + +// Information of the storage associated with each Version, including number of +// levels of LSM tree, files information at each level, files marked for +// compaction, blob files, etc. +class VersionStorageInfo { + public: + VersionStorageInfo(const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, + VersionStorageInfo* src_vstorage, + bool _force_consistency_checks); + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; + ~VersionStorageInfo(); + + void Reserve(int level, size_t size) { files_[level].reserve(size); } + + void AddFile(int level, FileMetaData* f); + + // Resize/Initialize the space for compact_cursor_ + void ResizeCompactCursors(int level) { + compact_cursor_.resize(level, InternalKey()); + } + + const std::vector& GetCompactCursors() const { + return compact_cursor_; + } + + // REQUIRES: ResizeCompactCursors has been called + void AddCursorForOneLevel(int level, + const InternalKey& smallest_uncompacted_key) { + compact_cursor_[level] = smallest_uncompacted_key; + } + + // REQUIRES: lock is held + // Update the compact cursor and advance the file index using increment + // so that it can point to the next cursor (increment means the number of + // input files in this level of the last compaction) + const InternalKey& GetNextCompactCursor(int level, size_t increment) { + int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment; + assert(cmp_idx <= (int)files_by_compaction_pri_[level].size()); + // TODO(zichen): may need to update next_file_to_compact_by_size_ + // for parallel compaction. 
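+    // Example with illustrative numbers: with 5 candidate files, a
+    // next-compaction index of 3 and an increment of 2 (the previous
+    // compaction consumed 2 files), cmp_idx becomes 5, which is wrapped to
+    // 0 below, so the cursor points back at the top-priority file's
+    // smallest key.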
+ InternalKey new_cursor; + if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) { + cmp_idx = 0; + } + // TODO(zichen): rethink if this strategy gives us some good guarantee + return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest; + } + + void ReserveBlob(size_t size) { blob_files_.reserve(size); } + + void AddBlobFile(std::shared_ptr blob_file_meta); + + void PrepareForVersionAppend(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); + + // REQUIRES: PrepareForVersionAppend has been called + void SetFinalized(); + + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); + + // Decrease the current stat from a to-be-deleted file-meta + void RemoveCurrentStats(FileMetaData* file_meta); + + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // REQUIRES: db_mutex held!! + // TODO find a better way to pass compaction_options_fifo. + void ComputeCompactionScore(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); + + // Estimate est_comp_needed_bytes_ + void EstimateCompactionBytesNeeded( + const MutableCFOptions& mutable_cf_options); + + // This computes files_marked_for_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForCompaction(); + + // This computes ttl_expired_files_ and is called by + // ComputeCompactionScore() + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, + const uint64_t ttl); + + // This computes files_marked_for_periodic_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForPeriodicCompaction( + const ImmutableOptions& ioptions, + const uint64_t periodic_compaction_seconds); + + // This computes bottommost_files_marked_for_compaction_ and is called by + // ComputeCompactionScore() or UpdateOldestSnapshot(). + // + // Among bottommost files (assumes they've already been computed), marks the + // ones that have keys that would be eliminated if recompacted, according to + // the seqnum of the oldest existing snapshot. Must be called every time + // oldest snapshot changes as that is when bottom-level files can become + // eligible for compaction. + // + // REQUIRES: DB mutex held + void ComputeBottommostFilesMarkedForCompaction(); + + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + + bool level0_non_overlapping() const { return level0_non_overlapping_; } + + // Updates the oldest snapshot and related internal state, like the bottommost + // files marked for compaction. 
+ // REQUIRES: DB mutex held + void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); + + int MaxInputLevel() const; + int MaxOutputLevel(bool allow_ingest_behind) const; + + // Return level number that has idx'th highest score + int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } + + // Return idx'th highest score + double CompactionScore(int idx) const { return compaction_score_[idx]; } + + void GetOverlappingInputs( + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr, // return index of overlap file + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, + // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included + void GetCleanInputsWithinInterval( + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr) // return index of overlap file + const; + + void GetOverlappingInputsRangeBinarySearch( + int level, // level > 0 + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index, // index of overlap file + int* file_index, // return index of overlap file + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included + + // Returns true iff some file in the specified level overlaps + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key largest than all keys in the DB. + bool OverlapInLevel(int level, const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Returns true iff the first or last file in inputs contains + // an overlapping user key to the file "just outside" of it (i.e. + // just after the last file, or just before the first file) + // REQUIRES: "*inputs" is a sorted list of non-overlapping files + bool HasOverlappingUserKey(const std::vector* inputs, + int level); + + int num_levels() const { return num_levels_; } + + // REQUIRES: PrepareForVersionAppend has been called + int num_non_empty_levels() const { + assert(finalized_); + return num_non_empty_levels_; + } + + // REQUIRES: PrepareForVersionAppend has been called + // This may or may not return number of level files. It is to keep backward + // compatible behavior in universal compaction. + int l0_delay_trigger_count() const { return l0_delay_trigger_count_; } + + void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; } + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + int NumLevelFiles(int level) const { + assert(finalized_); + return static_cast(files_[level].size()); + } + + // Return the combined file size of all files at the specified level. 
+ uint64_t NumLevelBytes(int level) const; + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + const std::vector& LevelFiles(int level) const { + return files_[level]; + } + + class FileLocation { + public: + FileLocation() = default; + FileLocation(int level, size_t position) + : level_(level), position_(position) {} + + int GetLevel() const { return level_; } + size_t GetPosition() const { return position_; } + + bool IsValid() const { return level_ >= 0; } + + bool operator==(const FileLocation& rhs) const { + return level_ == rhs.level_ && position_ == rhs.position_; + } + + bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); } + + static FileLocation Invalid() { return FileLocation(); } + + private: + int level_ = -1; + size_t position_ = 0; + }; + + // REQUIRES: PrepareForVersionAppend has been called + FileLocation GetFileLocation(uint64_t file_number) const { + const auto it = file_locations_.find(file_number); + + if (it == file_locations_.end()) { + return FileLocation::Invalid(); + } + + assert(it->second.GetLevel() < num_levels_); + assert(it->second.GetPosition() < files_[it->second.GetLevel()].size()); + assert(files_[it->second.GetLevel()][it->second.GetPosition()]); + assert(files_[it->second.GetLevel()][it->second.GetPosition()] + ->fd.GetNumber() == file_number); + + return it->second; + } + + // REQUIRES: PrepareForVersionAppend has been called + FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const { + auto location = GetFileLocation(file_number); + + if (!location.IsValid()) { + return nullptr; + } + + return files_[location.GetLevel()][location.GetPosition()]; + } + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + using BlobFiles = std::vector>; + const BlobFiles& GetBlobFiles() const { return blob_files_; } + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + BlobFiles::const_iterator GetBlobFileMetaDataLB( + uint64_t blob_file_number) const; + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + std::shared_ptr GetBlobFileMetaData( + uint64_t blob_file_number) const { + const auto it = GetBlobFileMetaDataLB(blob_file_number); + + assert(it == blob_files_.end() || *it); + + if (it != blob_files_.end() && + (*it)->GetBlobFileNumber() == blob_file_number) { + return *it; + } + + return std::shared_ptr(); + } + + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + struct BlobStats { + uint64_t total_file_size = 0; + uint64_t total_garbage_size = 0; + double space_amp = 0.0; + }; + + BlobStats GetBlobStats() const { + uint64_t total_file_size = 0; + uint64_t total_garbage_size = 0; + + for (const auto& meta : blob_files_) { + assert(meta); + + total_file_size += meta->GetBlobFileSize(); + total_garbage_size += meta->GetGarbageBlobBytes(); + } + + double space_amp = 0.0; + if (total_file_size > total_garbage_size) { + space_amp = static_cast(total_file_size) / + (total_file_size - total_garbage_size); + } + + return BlobStats{total_file_size, total_garbage_size, space_amp}; + } + + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { + assert(level < static_cast(level_files_brief_.size())); + return level_files_brief_[level]; + } + + // REQUIRES: PrepareForVersionAppend has been called + const std::vector& FilesByCompactionPri(int level) const { + assert(finalized_); + return files_by_compaction_pri_[level]; + } + + // REQUIRES: ComputeCompactionScore has been called + // REQUIRES: DB mutex held during 
access + const autovector>& FilesMarkedForCompaction() + const { + assert(finalized_); + return files_marked_for_compaction_; + } + + // REQUIRES: ComputeCompactionScore has been called + // REQUIRES: DB mutex held during access + const autovector>& ExpiredTtlFiles() const { + assert(finalized_); + return expired_ttl_files_; + } + + // REQUIRES: ComputeCompactionScore has been called + // REQUIRES: DB mutex held during access + const autovector>& + FilesMarkedForPeriodicCompaction() const { + assert(finalized_); + return files_marked_for_periodic_compaction_; + } + + void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) { + files_marked_for_periodic_compaction_.emplace_back(level, f); + } + + // REQUIRES: ComputeCompactionScore has been called + // REQUIRES: DB mutex held during access + const autovector>& + BottommostFilesMarkedForCompaction() const { + assert(finalized_); + return bottommost_files_marked_for_compaction_; + } + + // REQUIRES: ComputeCompactionScore has been called + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForForcedBlobGC() + const { + assert(finalized_); + return files_marked_for_forced_blob_gc_; + } + + int base_level() const { return base_level_; } + double level_multiplier() const { return level_multiplier_; } + + // REQUIRES: lock is held + // Set the index that is used to offset into files_by_compaction_pri_ to find + // the next compaction candidate file. + void SetNextCompactionIndex(int level, int index) { + next_file_to_compact_by_size_[level] = index; + } + + // REQUIRES: lock is held + int NextCompactionIndex(int level) const { + return next_file_to_compact_by_size_[level]; + } + + // REQUIRES: PrepareForVersionAppend has been called + const FileIndexer& file_indexer() const { + assert(finalized_); + return file_indexer_; + } + + // Only the first few entries of files_by_compaction_pri_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t kNumberFilesToSort = 50; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[1000]; + }; + struct FileSummaryStorage { + char buffer[3000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + uint64_t MaxNextLevelOverlappingBytes(); + + // Return a human readable string that describes this version's contents. 
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  uint64_t GetAverageValueSize() const {
+    if (accumulated_num_non_deletions_ == 0) {
+      return 0;
+    }
+    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+    assert(accumulated_file_size_ > 0);
+    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+           accumulated_file_size_ /
+           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+  }
+
+  uint64_t GetEstimatedActiveKeys() const;
+
+  double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+  // Re-initializes the index that is used to offset into
+  // files_by_compaction_pri_ to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  const InternalKeyComparator* InternalComparator() const {
+    return internal_comparator_;
+  }
+
+  // Returns maximum total bytes of data on a given level.
+  uint64_t MaxBytesForLevel(int level) const;
+
+  // Returns an estimate of the amount of live data in bytes.
+  uint64_t EstimateLiveDataSize() const;
+
+  uint64_t estimated_compaction_needed_bytes() const {
+    return estimated_compaction_needed_bytes_;
+  }
+
+  void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+    estimated_compaction_needed_bytes_ = v;
+  }
+
+  bool force_consistency_checks() const { return force_consistency_checks_; }
+
+  SequenceNumber bottommost_files_mark_threshold() const {
+    return bottommost_files_mark_threshold_;
+  }
+
+  // Returns whether any key in [`smallest_user_key`, `largest_user_key`]
+  // could appear in an older L0 file than `last_l0_idx` or in a greater level
+  // than `last_level`.
+  //
+  // @param last_level Level after which we check for overlap
+  // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+  //    check for overlap; otherwise, must be -1
+  bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+                                     const Slice& largest_user_key,
+                                     int last_level, int last_l0_idx);
+
+ private:
+  void ComputeCompensatedSizes();
+  void UpdateNumNonEmptyLevels();
+  void CalculateBaseBytes(const ImmutableOptions& ioptions,
+                          const MutableCFOptions& options);
+  void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
+                                  const MutableCFOptions& mutable_cf_options);
+
+  void GenerateFileIndexer() {
+    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+  }
+
+  void GenerateLevelFilesBrief();
+  void GenerateLevel0NonOverlapping();
+  void GenerateBottommostFiles();
+  void GenerateFileLocationIndex();
+
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  int num_levels_;            // Number of levels
+  int num_non_empty_levels_;  // Number of levels. Any level larger than it
+                              // is guaranteed to be empty.
+  // Per-level max bytes
+  std::vector<uint64_t> level_max_bytes_;
+
+  // A short brief metadata of files per level
+  autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+  FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for file_levels_
+
+  CompactionStyle compaction_style_;
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // Map of all table files in version. Maps file number to (level, position on
+  // level).
+  using FileLocations = UnorderedMap<uint64_t, FileLocation>;
+  FileLocations file_locations_;
+
+  // Vector of blob files in version sorted by blob file number.
+  BlobFiles blob_files_;
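+  // Illustrative sketch (hypothetical snippet, not upstream code): the
+  // file_locations_ index above is what makes the number-based lookups near
+  // the top of this class O(1):
+  //
+  //   const FileLocation loc = vstorage.GetFileLocation(file_number);
+  //   if (loc.IsValid()) {
+  //     FileMetaData* meta =
+  //         vstorage.LevelFiles(loc.GetLevel())[loc.GetPosition()];
+  //   }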
+  // Level that L0 data should be compacted to. All levels < base_level_
+  // should be empty. -1 if not applicable (i.e. the compaction style is not
+  // level-based).
+  int base_level_;
+
+  double level_multiplier_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, means that files in L0 have keys with non-overlapping ranges
+  bool level0_non_overlapping_;
+
+  // An index into files_by_compaction_pri_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains the list of files marked for compaction and also not
+  // currently being compacted. It is protected by DB mutex. It is calculated
+  // in ComputeCompactionScore().
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+  autovector<std::pair<int, FileMetaData*>>
+      files_marked_for_periodic_compaction_;
+
+  // These files are considered bottommost because none of their keys can exist
+  // at lower levels. They are not necessarily all in the same level. The
+  // marked ones are eligible for compaction because they contain duplicate key
+  // versions that are no longer protected by snapshot. These variables are
+  // protected by DB mutex and are calculated in `GenerateBottommostFiles()`
+  // and `ComputeBottommostFilesMarkedForCompaction()`.
+  autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+  autovector<std::pair<int, FileMetaData*>>
+      bottommost_files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
+  // Threshold for needing to mark another bottommost file. Maintain it so we
+  // can quickly check when releasing a snapshot whether more bottommost files
+  // became eligible for compaction. It's defined as the min of the max nonzero
+  // seqnums of unmarked bottommost files.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  // Monotonically increases as we release old snapshots. Zero indicates no
+  // snapshots have been released yet. When no snapshots remain we set it to
+  // the current seqnum, which needs to be protected as a snapshot can still be
+  // created that references it.
+  SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed. These fields
+  // are initialized by ComputeCompactionScore.
+  // The most critical level to be compacted is listed first.
+  // These are used to pick the best compaction level.
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and stop
+                                    // for number of L0 files.
+
+  // Compact cursors for round-robin compactions in each level
+  std::vector<InternalKey> compact_cursor_;
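+  // Illustrative sketch (hypothetical snippet): compaction_score_ and
+  // compaction_level_ above are parallel vectors ordered by decreasing
+  // urgency, so the most critical level is simply
+  //
+  //   if (!compaction_score_.empty() && compaction_score_[0] >= 1.0) {
+  //     int most_urgent_level = compaction_level_[0];  // compact this first
+  //   }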
+  // The following are the sampled temporary stats.
+  // The current accumulated size of sampled files.
+  uint64_t accumulated_file_size_;
+  // The current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_key_size_;
+  // The current accumulated size of all raw values based on the sampled files.
+  uint64_t accumulated_raw_value_size_;
+  // Total number of non-deletion entries
+  uint64_t accumulated_num_non_deletions_;
+  // Total number of deletion entries
+  uint64_t accumulated_num_deletions_;
+  // Current number of non-deletion entries
+  uint64_t current_num_non_deletions_;
+  // Current number of deletion entries
+  uint64_t current_num_deletions_;
+  // Current number of file samples
+  uint64_t current_num_samples_;
+  // Estimated bytes needed to be compacted until all levels' size is down to
+  // target sizes.
+  uint64_t estimated_compaction_needed_bytes_;
+
+  bool finalized_;
+
+  // If set to true, we will run consistency checks even if RocksDB
+  // is compiled in release mode
+  bool force_consistency_checks_;
+
+  friend class Version;
+  friend class VersionSet;
+};
+
+struct ObsoleteFileInfo {
+  FileMetaData* metadata;
+  std::string path;
+  // If true, the FileMetaData should be destroyed but the file should
+  // not be deleted. This is because another FileMetaData still references
+  // the file, usually because the file was trivially moved, so two
+  // FileMetaData objects are managing it.
+  bool only_delete_metadata = false;
+
+  ObsoleteFileInfo() noexcept
+      : metadata(nullptr), only_delete_metadata(false) {}
+  ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
+                   std::shared_ptr<CacheReservationManager>
+                       file_metadata_cache_res_mgr_arg = nullptr)
+      : metadata(f),
+        path(file_path),
+        only_delete_metadata(false),
+        file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
+
+  ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+  ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+  ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
+    *this = std::move(rhs);
+  }
+
+  ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+    path = std::move(rhs.path);
+    metadata = rhs.metadata;
+    rhs.metadata = nullptr;
+    file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
+    rhs.file_metadata_cache_res_mgr = nullptr;
+
+    return *this;
+  }
+
+  void DeleteMetadata() {
+    if (file_metadata_cache_res_mgr) {
+      Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
+          metadata->ApproximateMemoryUsage(), false /* increase */);
+      s.PermitUncheckedError();
+    }
+    delete metadata;
+    metadata = nullptr;
+  }
+
+ private:
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
+};
+
+class ObsoleteBlobFileInfo {
+ public:
+  ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
+      : blob_file_number_(blob_file_number), path_(std::move(path)) {}
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  const std::string& GetPath() const { return path_; }
+
+ private:
+  uint64_t blob_file_number_;
+  std::string path_;
+};
+
+using MultiGetRange = MultiGetContext::Range;
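+// Illustrative sketch (hypothetical purge loop, not upstream code): how the
+// structs above are typically drained. A file whose only_delete_metadata flag
+// is set keeps its data on disk, because another FileMetaData still refers to
+// the same physical file (e.g. after a trivial move):
+//
+//   for (ObsoleteFileInfo& info : obsolete_files) {
+//     if (!info.only_delete_metadata) {
+//       ScheduleFileDeletion(info.path);  // hypothetical helper
+//     }
+//     info.DeleteMetadata();
+//   }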
+// A column family's version consists of the table and blob files owned by
+// the column family at a certain point in time.
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // @param read_options Must outlive any iterator built by
+  // `merger_iter_builder`.
+  void AddIterators(const ReadOptions& read_options,
+                    const FileOptions& soptions,
+                    MergeIteratorBuilder* merger_iter_builder,
+                    bool allow_unprepared_value);
+
+  // @param read_options Must outlive any iterator built by
+  // `merger_iter_builder`.
+  void AddIteratorsForLevel(const ReadOptions& read_options,
+                            const FileOptions& soptions,
+                            MergeIteratorBuilder* merger_iter_builder,
+                            int level, bool allow_unprepared_value);
+
+  Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+                                  const Slice& smallest_user_key,
+                                  const Slice& largest_user_key, int level,
+                                  bool* overlap);
+
+  // Look up the value for key, or get all merge operands for key.
+  // If do_merge = true (default), then look up the value for key.
+  // Behavior if do_merge = true:
+  //   If found, store it in *value and return OK. Else return a non-OK
+  //   status. Uses *operands to store merge_operator operations to apply
+  //   later.
+  //
+  //   If the ReadOptions.read_tier is set to do a read-only fetch, then
+  //   *value_found will be set to false if it cannot be determined whether
+  //   this value exists without doing IO.
+  //
+  //   If the key is Deleted, *status will be set to NotFound and
+  //   *key_exists will be set to true.
+  //   If no key was found, *status will be set to NotFound and
+  //   *key_exists will be set to false.
+  //   If seq is non-null, *seq will be set to the sequence number found
+  //   for the key if a key was found.
+  // Behavior if do_merge = false:
+  //   If the key has any merge operands, then store them in
+  //   merge_context.operands_list and don't merge the operands.
+  // REQUIRES: lock is not held
+  // REQUIRES: pinned_iters_mgr != nullptr
+  void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+           PinnableWideColumns* columns, std::string* timestamp,
+           Status* status, MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           PinnedIteratorsManager* pinned_iters_mgr,
+           bool* value_found = nullptr, bool* key_exists = nullptr,
+           SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+           bool* is_blob = nullptr, bool do_merge = true);
+
+  void MultiGet(const ReadOptions&, MultiGetRange* range,
+                ReadCallback* callback = nullptr);
+
+  // Interprets blob_index_slice as a blob reference, and (assuming the
+  // corresponding blob file is part of this Version) retrieves the blob and
+  // saves it in *value.
+  // REQUIRES: blob_index_slice stores an encoded blob reference
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 const Slice& blob_index_slice,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read) const;
+
+  // Retrieves a blob using a blob reference and saves it in *value,
+  // assuming the corresponding blob file is part of this Version.
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 const BlobIndex& blob_index,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read) const;
+
+  using BlobReadContext =
+      std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
+  using BlobReadContexts = std::vector<BlobReadContext>;
+  void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
+                    std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
+
+  // Loads some stats information from files (if update_stats is set) and
+  // populates derived data structures. Call without mutex held. It needs to
+  // be called before appending the version to the version set.
+  void PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                     bool update_stats);
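+  // Illustrative sketch of the append protocol implied by the comment above
+  // (compare VersionSet::TEST_CreateAndAppendVersion() later in this file);
+  // hypothetical caller code, not upstream:
+  //
+  //   Version* v = new Version(cfd, vset, file_options, mutable_cf_options,
+  //                            io_tracer);
+  //   v->PrepareAppend(mutable_cf_options, /* update_stats */ true);
+  //   // ... then install v as the CF's current version under the DB mutex.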
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Add all files listed in the current version to *live_table_files and
+  // *live_blob_files.
+  void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                    std::vector<uint64_t>* live_blob_files) const;
+
+  // Remove live files that are in the delete candidate lists.
+  void RemoveLiveFiles(
+      std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+      std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties of the file
+  // specified in "file_meta". If the file name of "file_meta" is
+  // known ahead, passing it by a non-null "fname" can save a
+  // file-name conversion.
+  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                            const FileMetaData* file_meta,
+                            const std::string* fname = nullptr) const;
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the SST file names; the values of `props` are the
+  // tables' properties, represented as std::shared_ptr<const TableProperties>.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+  Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+                                      TablePropertiesCollection* props) const;
+
+  // Print summary of range delete tombstones in SST files into out_str,
+  // with maximum max_entries_to_print entries printed out.
+  Status TablesRangeTombstoneSummary(int max_entries_to_print,
+                                     std::string* out_str);
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the aggregated table property among
+  // the table properties of all sst files in this version.
+  Status GetAggregatedTableProperties(
+      std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+  uint64_t GetEstimatedActiveKeys() {
+    return storage_info_.GetEstimatedActiveKeys();
+  }
+
+  size_t GetMemoryUsageByTableReaders();
+
+  ColumnFamilyData* cfd() const { return cfd_; }
+
+  // Return the next Version in the linked list.
+  Version* Next() const { return next_; }
+
+  int TEST_refs() const { return refs_; }
+
+  VersionStorageInfo* storage_info() { return &storage_info_; }
+  const VersionStorageInfo* storage_info() const { return &storage_info_; }
+
+  VersionSet* version_set() { return vset_; }
+
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+  uint64_t GetSstFilesSize();
+
+  // Retrieves the file_creation_time of the oldest file in the DB.
+  // Prerequisite for this API is max_open_files = -1.
+  void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+  const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+  InternalIterator* TEST_GetLevelIterator(
+      const ReadOptions& read_options,
+      MergeIteratorBuilder* merge_iter_builder, int level,
+      bool allow_unprepared_value);
+
+ private:
+  Env* env_;
+  SystemClock* clock_;
+
+  friend class ReactiveVersionSet;
+  friend class VersionSet;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+
+  const InternalKeyComparator* internal_comparator() const {
+    return storage_info_.internal_comparator_;
+  }
+  const Comparator* user_comparator() const {
+    return storage_info_.user_comparator_;
+  }
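+  // Illustrative sketch for the table-property getters declared above
+  // (hypothetical snippet; assumes the DB mutex is held and `v` is a live
+  // Version*):
+  //
+  //   TablePropertiesCollection props;
+  //   Status s = v->GetPropertiesOfAllTables(&props);
+  //   for (const auto& [sst_name, tp] : props) {
+  //     // e.g. tp->num_entries, tp->raw_key_size, tp->raw_value_size
+  //   }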
+  // Returns true if the filter blocks in the specified level will not be
+  // checked during read operations. In certain cases (trivial move or
+  // preload), the filter block may already be cached, but we still do not
+  // access it such that it eventually expires from the cache.
+  bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+  // The helper function of UpdateAccumulatedStats, which may fill the missing
+  // fields of file_meta from its associated TableProperties.
+  // Returns true if it does initialize FileMetaData.
+  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+  // Update the accumulated stats associated with the current version.
+  // These accumulated stats will be used in compaction.
+  void UpdateAccumulatedStats();
+
+  DECLARE_SYNC_AND_ASYNC(
+      /* ret_type */ Status, /* func_name */ MultiGetFromSST,
+      const ReadOptions& read_options, MultiGetRange file_range,
+      int hit_file_level, bool skip_filters, bool skip_range_deletions,
+      FdWithKeyRange* f,
+      std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+      Cache::Handle* table_handle, uint64_t& num_filter_read,
+      uint64_t& num_index_read, uint64_t& num_sst_read);
+
+#ifdef USE_COROUTINES
+  // MultiGet using async IO to read data blocks from SST files in parallel
+  // within and across levels
+  Status MultiGetAsync(
+      const ReadOptions& options, MultiGetRange* range,
+      std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
+
+  // A helper function to look up a batch of keys in a single level. It will
+  // queue coroutine tasks to mget_tasks. It may also split the input batch
+  // by creating a new batch with keys definitely not in this level and
+  // enqueuing it to to_process.
+  Status ProcessBatch(
+      const ReadOptions& read_options, FilePickerMultiGet* batch,
+      std::vector<folly::coro::Task<Status>>& mget_tasks,
+      std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+      autovector<FilePickerMultiGet, 5>& batches, std::deque<size_t>& waiting,
+      std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+      std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+          mget_stats);
+#endif
+
+  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+  Logger* info_log_;
+  Statistics* db_statistics_;
+  TableCache* table_cache_;
+  BlobSource* blob_source_;
+  const MergeOperator* merge_operator_;
+
+  VersionStorageInfo storage_info_;
+  VersionSet* vset_;  // VersionSet to which this Version belongs
+  Version* next_;     // Next version in linked list
+  Version* prev_;     // Previous version in linked list
+  int refs_;          // Number of live refs to this version
+  const FileOptions file_options_;
+  const MutableCFOptions mutable_cf_options_;
+  // Cached value to avoid recomputing it on every read.
+  const size_t max_file_size_for_l0_meta_pin_;
+
+  // A version number that uniquely represents this version. This is
+  // used for debugging and logging purposes only.
+  uint64_t version_number_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+          MutableCFOptions mutable_cf_options,
+          const std::shared_ptr<IOTracer>& io_tracer,
+          uint64_t version_number = 0);
+
+  ~Version();
+
+  // No copying allowed
+  Version(const Version&) = delete;
+  void operator=(const Version&) = delete;
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+  AtomicGroupReadBuffer() = default;
+  Status AddEdit(VersionEdit* edit);
+  void Clear();
+  bool IsFull() const;
+  bool IsEmpty() const;
+
+  uint64_t TEST_read_edits_in_atomic_group() const {
+    return read_edits_in_atomic_group_;
+  }
+  std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+  uint64_t read_edits_in_atomic_group_ = 0;
+  std::vector<VersionEdit> replay_buffer_;
+};
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. the set of the column families.
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+             const FileOptions& file_options, Cache* table_cache,
+             WriteBufferManager* write_buffer_manager,
+             WriteController* write_controller,
+             BlockCacheTracer* const block_cache_tracer,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const std::string& db_id, const std::string& db_session_id);
+  // No copying allowed
+  VersionSet(const VersionSet&) = delete;
+  void operator=(const VersionSet&) = delete;
+
+  virtual ~VersionSet();
+
+  Status LogAndApplyToDefaultColumnFamily(
+      VersionEdit* edit, InstrumentedMutex* mu,
+      FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
+    const MutableCFOptions* cf_options =
+        default_cf->GetLatestMutableCFOptions();
+    return LogAndApply(default_cf, *cf_options, edit, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options);
+  }
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version. Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is a column family add.
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+      InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(edit);
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options);
+  }
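+  // Illustrative sketch (hypothetical caller; assumes the DB mutex `mu` is
+  // held and `db_dir` is the FSDirectory that contains the CURRENT file):
+  //
+  //   VersionEdit edit;
+  //   edit.SetLogNumber(new_log_number);
+  //   Status s = versions->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+  //                                    &edit, &mu, db_dir);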
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family add or drop.
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options,
+      const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+      FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr,
+      const std::function<void(const Status&)>& manifest_wcb = {}) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options, {manifest_wcb});
+  }
+
+  // The across-multi-cf batch version. If edit_lists contains more than
+  // one version edit, caller must ensure that no edit in the lists is a
+  // column family manipulation.
+  virtual Status LogAndApply(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<autovector<VersionEdit*>>& edit_lists,
+      InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* new_cf_options = nullptr,
+      const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
+          {});
+
+  static Status GetCurrentManifestPath(const std::string& dbname,
+                                       FileSystem* fs,
+                                       std::string* manifest_filename,
+                                       uint64_t* manifest_file_number);
+  void WakeUpWaitingManifestWriters();
+
+  // Recover the last saved descriptor from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, std::string* db_id = nullptr,
+                 bool no_error_if_files_missing = false);
+
+  Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                    bool read_only,
+                    const std::vector<std::string>& files_in_dbname,
+                    std::string* db_id, bool* has_missing_table_file);
+
+  // Try to recover the version set to the most recent consistent state
+  // recorded in the specified manifest.
+  Status TryRecoverFromOneManifest(
+      const std::string& manifest_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only, std::string* db_id, bool* has_missing_table_file);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, FileSystem* fs);
+  static Status ListColumnFamiliesFromManifest(
+      const std::string& manifest_path, FileSystem* fs,
+      std::vector<std::string>* column_families);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, a db currently has 7 levels [0-6], and a call
+  // to reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const FileOptions& file_options, + int new_levels); + + // Get the checksum information of all live files + Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list); + + // printf contents (for debugging) + Status DumpManifest(Options& options, std::string& manifestFileName, + bool verbose, bool hex = false, bool json = false); + +#endif // ROCKSDB_LITE + + const std::string& DbSessionId() const { return db_session_id_; } + + // Return the current manifest file number + uint64_t manifest_file_number() const { return manifest_file_number_; } + + uint64_t options_file_number() const { return options_file_number_; } + + uint64_t pending_manifest_file_number() const { + return pending_manifest_file_number_; + } + + uint64_t current_next_file_number() const { return next_file_number_.load(); } + + uint64_t min_log_number_to_keep() const { + return min_log_number_to_keep_.load(); + } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + + // Fetch And Add n new file number + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + + // Return the last sequence number. + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastAllocatedSequence() const { + return last_allocated_sequence_.load(std::memory_order_seq_cst); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastPublishedSequence() const { + return last_published_sequence_.load(std::memory_order_seq_cst); + } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + // Last visible sequence must always be less than last written seq + assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); + last_sequence_.store(s, std::memory_order_release); + } + + // Note: memory_order_release must be sufficient + void SetLastPublishedSequence(uint64_t s) { + assert(s >= last_published_sequence_); + last_published_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + void SetLastAllocatedSequence(uint64_t s) { + assert(s >= last_allocated_sequence_); + last_allocated_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + uint64_t FetchAddLastAllocatedSequence(uint64_t s) { + return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); + } + + // Mark the specified file number as used. + // REQUIRED: this is only called during single-threaded recovery or repair. + void MarkFileNumberUsed(uint64_t number); + + // Mark the specified log number as deleted + // REQUIRED: this is only called during single-threaded recovery or repair, or + // from ::LogAndApply where the global mutex is held. + void MarkMinLogNumberToKeep(uint64_t number); + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t prev_log_number() const { return prev_log_number_; } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // In non-2PC mode, all the log numbers smaller than this number can be safely + // deleted, although we still use `min_log_number_to_keep_` to determine when + // to delete a WAL file. 
+  uint64_t MinLogNumberWithUnflushedData() const {
+    return PreComputeMinLogNumberWithUnflushedData(nullptr);
+  }
+
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file.
+  // Empty column families' log number is considered to be
+  // new_log_number_for_empty_cf.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      uint64_t new_log_number_for_empty_cf) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      uint64_t num =
+          cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
+      if (min_log_num > num && !cfd->IsDropped()) {
+        min_log_num = num;
+      }
+    }
+    return min_log_num;
+  }
+
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file, except data from `cfd_to_skip`.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      const ColumnFamilyData* cfd_to_skip) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (cfd == cfd_to_skip) {
+        continue;
+      }
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file, except data from `cfds_to_skip`.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (cfds_to_skip.count(cfd)) {
+        continue;
+      }
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  // @param read_options Must outlive the returned iterator.
+  // @param start, end indicate the compaction range.
+  InternalIterator* MakeInputIterator(
+      const ReadOptions& read_options, const Compaction* c,
+      RangeDelAggregator* range_del_agg,
+      const FileOptions& file_options_compactions,
+      const std::optional<const Slice>& start,
+      const std::optional<const Slice>& end);
+
+  // Add all files listed in any live version to *live_table_files and
+  // *live_blob_files. Note that these lists may contain duplicates.
+  void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                    std::vector<uint64_t>* live_blob_files) const;
+
+  // Remove live files that are in the delete candidate lists.
+  void RemoveLiveFiles(
+      std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+      std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
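+  // Worked example for the PreComputeMinLogNumberWithUnflushedData() helpers
+  // above (illustrative numbers): with two column families whose log numbers
+  // are {5, 8}, skipping nothing yields 5. If the CF at log 5 is empty and
+  // new_log_number_for_empty_cf is 10, the result becomes min(10, 8) = 8.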
+  // Return the approximate size of data to be scanned for range [start, end)
+  // in levels [start_level, end_level). If end_level == -1 it will search
+  // through all non-empty levels.
+  uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+                           const Slice& start, const Slice& end,
+                           int start_level, int end_level,
+                           TableReaderCaller caller);
+
+  // Return the size of the current manifest file
+  uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  // This function doesn't support leveldb SST filenames
+  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+  void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
+    assert(table_cache_);
+
+    table_cache_->Erase(GetSlice(&blob_file_number));
+
+    obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
+  }
+
+  void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+                        std::vector<ObsoleteBlobFileInfo>* blob_files,
+                        std::vector<std::string>* manifest_filenames,
+                        uint64_t min_pending_output);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+  RefedColumnFamilySet GetRefedColumnFamilySet() {
+    return RefedColumnFamilySet(GetColumnFamilySet());
+  }
+
+  const FileOptions& file_options() { return file_options_; }
+  void ChangeFileOptions(const MutableDBOptions& new_options) {
+    file_options_.writable_file_max_buffer_size =
+        new_options.writable_file_max_buffer_size;
+  }
+
+  const ImmutableDBOptions* db_options() const { return db_options_; }
+
+  static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+  static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
+
+  // Get the IO Status returned by written Manifest.
+  const IOStatus& io_status() const { return io_status_; }
+
+  // The returned WalSet needs to be accessed with DB mutex held.
+  const WalSet& GetWalSet() const { return wals_; }
+
+  void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
+    assert(cfd);
+
+    const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+    Version* const version =
+        new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
+
+    constexpr bool update_stats = false;
+    version->PrepareAppend(mutable_cf_options, update_stats);
+    AppendVersion(cfd, version);
+  }
+
+ protected:
+  using VersionBuilderMap =
+      UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
+
+  struct ManifestWriter;
+
+  friend class Version;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+  friend class DumpManifestHandler;
+  friend class DBImpl;
+  friend class DBImplReadOnly;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+      if (status->ok()) {
+        *status = s;
+      }
+    }
+  };
+
+  void Reset();
+
+  // Returns approximated offset of a key in a file for a given version.
+  uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+                               const Slice& key, TableReaderCaller caller);
+
+  // Returns approximated data size between start and end keys in a file
+  // for a given version.
+  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+                           const Slice& start, const Slice& end,
+                           TableReaderCaller caller);
+
+  struct MutableCFState {
+    uint64_t log_number;
+    std::string full_history_ts_low;
+
+    explicit MutableCFState() = default;
+    explicit MutableCFState(uint64_t _log_number, std::string ts_low)
+        : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
+  };
+
+  // Save current contents to *log
+  Status WriteCurrentStateToManifest(
+      const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+      const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                       const VersionEdit* edit);
+
+  Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& meta);
+
+  // Protected by DB mutex.
+  WalSet wals_;
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+  Cache* table_cache_;
+  Env* const env_;
+  FileSystemPtr const fs_;
+  SystemClock* const clock_;
+  const std::string dbname_;
+  std::string db_id_;
+  const ImmutableDBOptions* const db_options_;
+  std::atomic<uint64_t> next_file_number_;
+  // Any WAL number smaller than this should be ignored during recovery,
+  // and is qualified for being deleted.
+  std::atomic<uint64_t> min_log_number_to_keep_ = {0};
+  uint64_t manifest_file_number_;
+  uint64_t options_file_number_;
+  uint64_t options_file_size_;
+  uint64_t pending_manifest_file_number_;
+  // The last seq visible to reads. It normally indicates the last sequence in
+  // the memtable but when using two write queues it could also indicate the
+  // last sequence in the WAL visible to reads.
+  std::atomic<uint64_t> last_sequence_;
+  // The last sequence number of data committed to the descriptor (manifest
+  // file).
+  SequenceNumber descriptor_last_sequence_ = 0;
+  // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case seq might or might not have appeared in the
+  // memtable but it is expected to appear in the WAL.
+  // We have last_sequence <= last_allocated_sequence_.
+  std::atomic<uint64_t> last_allocated_sequence_;
+  // The last allocated sequence that is also published to the readers. This is
+  // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+  // last_sequence_ also indicates the last published seq.
+  // We have last_sequence <= last_published_sequence_ <=
+  // last_allocated_sequence_.
+  std::atomic<uint64_t> last_published_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  std::unique_ptr<log::Writer> descriptor_log_;
+
+  // Generates an increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<ObsoleteFileInfo> obsolete_files_;
+  std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
+  std::vector<std::string> obsolete_manifests_;
+
+  // env options for all reads and writes except compactions
+  FileOptions file_options_;
+
+  BlockCacheTracer* const block_cache_tracer_;
+
+  // Store the IO status when Manifest is written
+  IOStatus io_status_;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  std::string db_session_id_;
+
+ private:
+  // REQUIRES: db mutex held at beginning; may release and re-acquire it.
+  Status ProcessManifestWrites(std::deque<ManifestWriter*>& writers,
+                               InstrumentedMutex* mu,
+                               FSDirectory* dir_contains_current_file,
+                               bool new_descriptor_log,
+                               const ColumnFamilyOptions* new_cf_options);
+
+  void LogAndApplyCFHelper(VersionEdit* edit,
+                           SequenceNumber* max_last_sequence);
+  Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+                           VersionEdit* edit,
+                           SequenceNumber* max_last_sequence,
+                           InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+  ReactiveVersionSet(const std::string& dbname,
+                     const ImmutableDBOptions* _db_options,
+                     const FileOptions& _file_options, Cache* table_cache,
+                     WriteBufferManager* write_buffer_manager,
+                     WriteController* write_controller,
+                     const std::shared_ptr<IOTracer>& io_tracer);
+
+  ~ReactiveVersionSet() override;
+
+  Status ReadAndApply(
+      InstrumentedMutex* mu,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+      Status* manifest_read_status,
+      std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+                 std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+                 std::unique_ptr<Status>* manifest_reader_status);
+#ifndef NDEBUG
+  uint64_t TEST_read_edits_in_atomic_group() const;
+#endif  //! NDEBUG
+
+  std::vector<VersionEdit>& replay_buffer();
+
+ protected:
+  // REQUIRES db mutex
+  Status ApplyOneVersionEditToBuilder(
+      VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+      VersionEdit* version_edit);
+
+  Status MaybeSwitchManifest(
+      log::Reader::Reporter* reporter,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+  std::unique_ptr<ManifestTailer> manifest_tailer_;
+
+  using VersionSet::LogAndApply;
+  using VersionSet::Recover;
+
+  Status LogAndApply(
+      const autovector<ColumnFamilyData*>& /*cfds*/,
+      const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+      const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+      InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
+      bool /*new_descriptor_log*/,
+      const ColumnFamilyOptions* /*new_cf_option*/,
+      const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/)
+      override {
+    return Status::NotSupported("not supported in reactive mode");
+  }
+
+  // No copy allowed
+  ReactiveVersionSet(const ReactiveVersionSet&);
+  ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_sync_and_async.h b/src/rocksdb/db/version_set_sync_and_async.h
new file mode 100644
index 000000000..755585990
--- /dev/null
+++ b/src/rocksdb/db/version_set_sync_and_async.h
@@ -0,0 +1,151 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lookup a batch of keys in a single SST file
+DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
+(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
+ bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read) {
+  bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+                       get_perf_context()->per_level_perf_context_enabled;
+
+  Status s;
+  StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+  s = CO_AWAIT(table_cache_->MultiGet)(
+      read_options, *internal_comparator(), *f->file_metadata, &file_range,
+      mutable_cf_options_.prefix_extractor,
+      cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
+      skip_range_deletions, hit_file_level, table_handle);
+  // TODO: examine the behavior for corrupted key
+  if (timer_enabled) {
+    PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                              hit_file_level);
+  }
+  if (!s.ok()) {
+    // TODO: Set status for individual keys appropriately
+    for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+      *iter->s = s;
+      file_range.MarkKeyDone(iter);
+    }
+    CO_RETURN s;
+  }
+  uint64_t batch_size = 0;
+  for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
+       ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    // The Status in the KeyContext takes precedence over the GetContext state.
+    // Status may be an error if there were any IO errors in the table
+    // reader. We never expect Status to be NotFound(), as that is
+    // determined by get_context.
+    assert(!status->IsNotFound());
+    if (!status->ok()) {
+      file_range.MarkKeyDone(iter);
+      continue;
+    }
+
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+    batch_size++;
+    num_index_read += get_context.get_context_stats_.num_index_read;
+    num_filter_read += get_context.get_context_stats_.num_filter_read;
+    num_sst_read += get_context.get_context_stats_.num_sst_read;
+    // Reset these stats since they're specific to a level
+    get_context.get_context_stats_.num_index_read = 0;
+    get_context.get_context_stats_.num_filter_read = 0;
+    get_context.get_context_stats_.num_sst_read = 0;
+
+    // Report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    } else {
+      if (iter->max_covering_tombstone_seq > 0) {
+        // The remaining files we look at will only contain covered keys, so
+        // we stop here for this key
+        file_range.SkipKey(iter);
+      }
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (hit_file_level == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (hit_file_level == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (hit_file_level >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
+
+        file_range.MarkKeyDone(iter);
+
+        if (iter->is_blob_index) {
+          if (iter->value) {
+            TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
+                                     &(*iter));
+
+            const Slice& blob_index_slice = *(iter->value);
+            BlobIndex blob_index;
+            Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
+            if (tmp_s.ok()) {
+              const uint64_t blob_file_num = blob_index.file_number();
+              blob_ctxs[blob_file_num].emplace_back(
+                  std::make_pair(blob_index, std::cref(*iter)));
+            } else {
+              *(iter->s) = tmp_s;
+            }
+          }
+        } else {
+          file_range.AddValueSize(iter->value->size());
+          if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
+            s = Status::Aborted();
+            break;
+          }
+        }
+        continue;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        file_range.MarkKeyDone(iter);
+        continue;
+      case GetContext::kCorrupt:
+        *status =
+            Status::Corruption("corrupted key for ", iter->lkey->user_key());
+        file_range.MarkKeyDone(iter);
+        continue;
+      case GetContext::kUnexpectedBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        file_range.MarkKeyDone(iter);
+        continue;
+    }
+  }
+
+  RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+  CO_RETURN s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..7d17406c1
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,3587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_writer.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+  std::vector<FileMetaData*> files_;
+  LevelFilesBrief file_level_;
+  Arena arena_;
+
+  GenerateLevelFilesBriefTest() {}
+
+  ~GenerateLevelFilesBriefTest() override {
+    for (size_t i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData(
+        files_.size() + 1, 0, 0,
+        InternalKey(smallest, smallest_seq, kTypeValue),
+        InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+        largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
+        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    files_.push_back(f);
+  }
+
+  int Compare() {
+    int diff = 0;
+    for (size_t i = 0; i < files_.size(); i++) {
+      if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+        diff++;
+      }
+    }
+    return diff;
+  }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(0u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+  Add("p", "q");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(1u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(4u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+class CountingLogger : public Logger {
+ public:
+  CountingLogger() : log_count(0) {}
+  using Logger::Logv;
+  void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+  int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+                                std::shared_ptr<Logger> logger) {
+  Options opt;
+  opt.num_levels = num_levels;
+  opt.info_log = logger;
+  return opt;
+}
+
+class VersionStorageInfoTestBase : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  std::shared_ptr<Logger> logger_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  explicit VersionStorageInfoTestBase(const Comparator* ucmp)
+      : ucmp_(ucmp),
+        icmp_(ucmp_),
+        logger_(new CountingLogger()),
+        options_(GetOptionsWithNumLevels(6, logger_)),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel,
+                  /*src_vstorage=*/nullptr,
+                  /*_force_consistency_checks=*/false) {}
+
+  ~VersionStorageInfoTestBase() override {
+    for (int i = 0; i < vstorage_.num_levels(); ++i) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    constexpr SequenceNumber dummy_seq = 0;
+
+    Add(level, file_number, GetInternalKey(smallest, dummy_seq),
+        GetInternalKey(largest, dummy_seq), file_size,
+        oldest_blob_file_number);
+  }
+
+  void Add(int level, uint32_t file_number, const InternalKey& smallest,
+           const InternalKey& largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+        /* largest_seq */ 0, /* marked_for_compact */ false,
+        Temperature::kUnknown, oldest_blob_file_number,
+        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size = file_size;
+    vstorage_.AddFile(level, f);
+  }
+
+  void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+               uint64_t total_blob_bytes,
+               BlobFileMetaData::LinkedSsts linked_ssts,
+               uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        /* checksum_method */ std::string(),
+        /* checksum_value */ std::string());
+    auto meta = BlobFileMetaData::Create(std::move(shared_meta),
+                                         std::move(linked_ssts),
+                                         garbage_blob_count,
+                                         garbage_blob_bytes);
+
+    vstorage_.AddBlobFile(std::move(meta));
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_.PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+    vstorage_.SetFinalized();
+  }
+
+  std::string GetOverlappingFiles(int level, const InternalKey& begin,
+                                  const InternalKey& end) {
+    std::vector<FileMetaData*> inputs;
+    vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+    std::string result;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (i > 0) {
+        result += ",";
+      }
+      AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+    }
+    return result;
+  }
+};
+
+class VersionStorageInfoTest : public VersionStorageInfoTestBase {
+ public:
+  VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {}
+
+  ~VersionStorageInfoTest() override {}
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.max_bytes_for_level_base = 10;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(4, 100U, "1", "2", 100U);
+  Add(5, 101U, "1", "2", 100U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_1) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.base_level(), 5);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_2)
{ + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + + Add(5, 1U, "1", "2", 500U); + Add(5, 2U, "3", "4", 550U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U); + ASSERT_EQ(vstorage_.base_level(), 4); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + + Add(5, 1U, "1", "2", 500U); + Add(5, 2U, "3", "4", 550U); + Add(4, 3U, "3", "4", 550U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U); + ASSERT_EQ(vstorage_.base_level(), 4); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_4) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + + Add(5, 1U, "1", "2", 500U); + Add(5, 2U, "3", "4", 550U); + Add(4, 3U, "3", "4", 550U); + Add(3, 4U, "3", "4", 250U); + Add(3, 5U, "5", "7", 300U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U); + ASSERT_EQ(vstorage_.base_level(), 3); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_5) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + + Add(5, 1U, "1", "2", 500U); + Add(5, 2U, "3", "4", 550U); + Add(4, 3U, "3", "4", 550U); + Add(3, 4U, "3", "4", 250U); + Add(3, 5U, "5", "7", 300U); + Add(1, 6U, "3", "4", 5U); + Add(1, 7U, "8", "9", 5U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, logger_->log_count); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U); + ASSERT_EQ(vstorage_.base_level(), 1); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 100; + mutable_cf_options_.max_bytes_for_level_multiplier = 2; + + Add(0, 1U, "1", "2", 50U); + Add(1, 2U, "1", "2", 50U); + Add(2, 3U, "1", "2", 500U); + Add(3, 4U, "1", "2", 500U); + Add(4, 5U, "1", "2", 1700U); + Add(5, 6U, "1", "2", 500U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U); + ASSERT_EQ(vstorage_.base_level(), 1); + ASSERT_EQ(0, logger_->log_count); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) { + uint64_t kOneGB = 1000U * 1000U * 1000U; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + Add(0, 1U, "1", "2", 50U); + Add(3, 4U, "1", "2", 32U * kOneGB); + Add(4, 5U, "1", "2", 500U * kOneGB); + Add(5, 6U, "1", "2", 3000U * kOneGB); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB); + 
ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB); + ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB); + ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB); + ASSERT_EQ(vstorage_.base_level(), 2); + ASSERT_EQ(0, logger_->log_count); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 40000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 1U, "1", "2", 10000U); + Add(0, 2U, "1", "2", 10000U); + Add(0, 3U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_EQ(vstorage_.level_multiplier(), 5.0); + ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); + ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); + + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + // Only L0 hits compaction. + ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + + Add(0, 11U, "1", "2", 10000U); + Add(0, 12U, "1", "2", 10000U); + Add(0, 13U, "1", "2", 10000U); + + // Level size should be around 10,000, 10,290, 51,450, 257,250 + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 258000U); // unadjusted score 1.003 + Add(3, 6U, "1", "2", 53000U); // unadjusted score 1.03 + Add(2, 7U, "1", "2", 20000U); // unadjusted score 1.94 + + UpdateVersionStorageInfo(); + + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(1, vstorage_.base_level()); + ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1)); + ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2)); + ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); + ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); + + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + // Although L2 and l3 have higher unadjusted compaction score, considering + // a relatively large L0 being compacted down soon, L4 is picked up for + // compaction. + // L0 is still picked up for oversizing. 
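
For context on the numbers asserted in these dynamic-level tests: with level_compaction_dynamic_level_bytes, targets are derived by walking up from the bottommost level, dividing by max_bytes_for_level_multiplier; the first level whose derived target would fall below max_bytes_for_level_base becomes the base level and receives max_bytes_for_level_base itself. A minimal standalone sketch of that arithmetic (a hypothetical helper, not the actual VersionStorageInfo code, and ignoring the large-L0 adjustment these tests also exercise):

#include <cstdint>
#include <vector>

// Sketch of dynamic level-target computation: walk upward from the
// bottommost level, dividing by the multiplier; the first level whose
// computed target would drop below `base_bytes` becomes the base level.
std::vector<uint64_t> LevelTargets(uint64_t bottom_level_size,
                                   double multiplier, uint64_t base_bytes,
                                   int num_levels) {
  std::vector<uint64_t> targets(num_levels, 0);
  uint64_t cur = bottom_level_size;
  int level = num_levels - 1;
  while (level > 0 && cur > base_bytes) {
    targets[level] = cur;
    cur = static_cast<uint64_t>(cur / multiplier);
    --level;
  }
  targets[level] = base_bytes;  // base level gets max_bytes_for_level_base
  return targets;
}
// For the test above: LevelTargets(1286250, 5, 10000, 6) yields
// {0, 10000, 10290, 51450, 257250, 1286250}, matching the assertions.
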
+ ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0)); + ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1)); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 20000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 5; + + Add(0, 11U, "1", "2", 2500U); + Add(0, 12U, "1", "2", 2500U); + Add(0, 13U, "1", "2", 2500U); + Add(0, 14U, "1", "2", 2500U); + + // Level size should be around 20,000, 53000, 258000 + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 260000U); // Unadjusted score 1.01, adjusted about 4.3 + Add(3, 6U, "1", "2", 85000U); // Unadjusted score 1.42, adjusted about 11.6 + Add(2, 7U, "1", "2", 30000); // Unadjusted score 1.5, adjusted about 10.0 + + UpdateVersionStorageInfo(); + + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2)); + + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + // Although L2 has higher unadjusted compaction score, considering + // a relatively large L0 being compacted down soon, L3 is picked up for + // compaction. + + ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0)); + ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1)); + ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2)); +} + +TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { + // Test whether the overlaps are detected as expected + Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level + Add(2, 2U, "3", "5", 1U); // Partial overlap with last level + Add(2, 3U, "6", "8", 1U); // Partial overlap with last level + Add(3, 4U, "1", "9", 1U); // Contains range of last level + Add(4, 5U, "4", "5", 1U); // Inside range of last level + Add(4, 6U, "6", "7", 1U); // Inside range of last level + Add(5, 7U, "4", "7", 10U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize()); +} + +TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { + Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered + Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1 + Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2 + Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2 + Add(1, 5U, "5", "6", 1U); + Add(2, 6U, "2", "3", 1U); + Add(3, 7U, "7", "8", 1U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); +} + +TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { + // Two files that overlap at the range deletion tombstone sentinel. + Add(1, 1U, {"a", 0, kTypeValue}, + {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1); + Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1); + // Two files that overlap at the same user key. + Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1); + Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1); + // Two files that do not overlap. 
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1); + Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1); + + UpdateVersionStorageInfo(); + + ASSERT_EQ("1,2", + GetOverlappingFiles(1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue})); + ASSERT_EQ("1", + GetOverlappingFiles(1, {"a", 0, kTypeValue}, + {"b", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("2", GetOverlappingFiles(1, {"b", kMaxSequenceNumber, kTypeValue}, + {"c", 0, kTypeValue})); + ASSERT_EQ("3,4", + GetOverlappingFiles(1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue})); + ASSERT_EQ("3", + GetOverlappingFiles(1, {"d", 0, kTypeValue}, + {"e", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("3,4", GetOverlappingFiles(1, {"e", kMaxSequenceNumber, kTypeValue}, + {"f", 0, kTypeValue})); + ASSERT_EQ("3,4", + GetOverlappingFiles(1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("5", + GetOverlappingFiles(1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue})); + ASSERT_EQ("6", + GetOverlappingFiles(1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); +} + +TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + + Add(2, 7U, "1", "2", 8000U); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(vstorage_.GetFileLocation(11U), + VersionStorageInfo::FileLocation(0, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(12U), + VersionStorageInfo::FileLocation(0, 1)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(7U), + VersionStorageInfo::FileLocation(2, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr); + + ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid()); + ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { + // No SST or blob files in VersionStorageInfo + UpdateVersionStorageInfo(); + + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.75; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { + // Test the edge case when all blob files are part of the oldest batch. + // We have one L0 SST file #1, and four blob files #10, #11, #12, and #13. + // The oldest blob file used by SST #1 is blob file #10. 
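
The forced-GC eligibility logic exercised below has two gates: the age cutoff selects a prefix of the blob files ordered from oldest to newest (the "oldest batch" extends through the oldest blob file still linked from a live SST), and the batch's aggregate garbage ratio must reach force_threshold. A rough sketch of the second gate, with the test's numbers worked out (hypothetical types, not from this patch):

#include <cstdint>
#include <vector>

struct BlobStats {
  uint64_t total_bytes;
  uint64_t garbage_bytes;
};

// Hypothetical sketch of the threshold gate: sum garbage over the batch of
// blob files under consideration and compare the aggregate ratio against
// force_threshold, as ComputeFilesMarkedForForcedBlobGC does internally.
bool BatchMeetsThreshold(const std::vector<BlobStats>& batch,
                         double force_threshold) {
  uint64_t total = 0;
  uint64_t garbage = 0;
  for (const BlobStats& b : batch) {
    total += b.total_bytes;
    garbage += b.garbage_bytes;
  }
  return total > 0 && static_cast<double>(garbage) >=
                          force_threshold * static_cast<double>(total);
}
// With all four files below in the batch (age_cutoff == 1.0):
// garbage = 15000 + 235000 + 400000 + 600000 = 1250000
// total   = 100000 + 400000 + 1000000 + 1000000 = 2500000
// ratio   = 0.5, so force_threshold 0.5 passes and 0.6 does not.
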
+ + constexpr int level = 0; + + constexpr uint64_t sst = 1; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, sst, smallest, largest, file_size, first_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 400000; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 600000; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + UpdateVersionStorageInfo(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 1); + assert(level_files[0] && level_files[0]->fd.GetNumber() == sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, #12 and #13) is + // ineligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff but its overall garbage ratio + // is below threshold + + { + constexpr double age_cutoff = 1.0; + constexpr double force_threshold = 0.6; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff and its overall garbage ratio + // meets threshold + + { + constexpr double age_cutoff = 1.0; + constexpr double force_threshold = 0.5; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); + ASSERT_EQ(ssts_to_be_compacted.size(), 1); + + const autovector> + expected_ssts_to_be_compacted{{level, level_files[0]}}; + + ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); + } +} + +TEST_F(VersionStorageInfoTest, 
ForcedBlobGCMultipleBatches) { + // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13). + // The first two SSTs have the same oldest blob file, namely, the very oldest + // one (10), while the third SST's oldest blob file reference points to the + // third blob file (12). Thus, the oldest batch of blob files contains the + // first two blob files 10 and 11, and assuming they are eligible for GC based + // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them. + + constexpr int level = 0; + + constexpr uint64_t first_sst = 1; + constexpr uint64_t second_sst = 2; + constexpr uint64_t third_sst = 3; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, first_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar2"; + constexpr char largest[] = "foo2"; + constexpr uint64_t file_size = 2000; + + Add(level, second_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar3"; + constexpr char largest[] = "foo3"; + constexpr uint64_t file_size = 3000; + + Add(level, third_sst, smallest, largest, file_size, third_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{first_sst, second_sst}, + garbage_blob_count, garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 123456; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 789012345; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 88888888; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + UpdateVersionStorageInfo(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 3); + assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst); + assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst); + assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, the second file) is + // ineligible for GC due to the age cutoff 
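
To see why: with four blob files, an age cutoff of 0.25 covers only the first one (#10), but the oldest batch here is {#10, #11} because SSTs 1 and 2 both record #10 as their oldest blob reference; since #11 falls outside the cutoff, the whole batch stays ineligible, which is what the next block asserts. At age_cutoff 0.5 the batch is covered, and its aggregate ratio (15000 + 235000) / (100000 + 400000) = 0.5 meets a 0.5 threshold but not 0.6.
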
+
+  {
+    constexpr double age_cutoff = 0.25;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+    std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+              [](const std::pair<int, FileMetaData*>& lhs,
+                 const std::pair<int, FileMetaData*>& rhs) {
+                assert(lhs.second);
+                assert(rhs.second);
+                return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+              });
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]},
+                                      {level, level_files[1]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+    ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+  }
+
+  // Now try the last two cases again with a greater than necessary age cutoff
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+
+  {
+    constexpr double age_cutoff = 0.75;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+
+  {
+    constexpr double age_cutoff = 0.75;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+    std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+              [](const std::pair<int, FileMetaData*>& lhs,
+                 const std::pair<int, FileMetaData*>& rhs) {
+                assert(lhs.second);
+                assert(rhs.second);
+                return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+              });
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]},
+                                      {level, level_files[1]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+    ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+  }
+}
+
+class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
+ public:
+  VersionStorageInfoTimestampTest()
+      : VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) {
+  }
+  ~VersionStorageInfoTimestampTest() override {}
+  std::string Timestamp(uint64_t ts) const {
+    std::string ret;
+    PutFixed64(&ret, ts);
+    return ret;
+  }
+  std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const {
+    std::string ret;
+    ret.assign(ukey.data(), ukey.size());
+    PutFixed64(&ret, ts);
+    return ret;
+  }
+};
+
+TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) {
+  Add(/*level=*/1, /*file_number=*/1, /*smallest=*/
+      {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue},
+      /*largest=*/
+      {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue},
+ /*file_size=*/100); + Add(/*level=*/1, /*file_number=*/2, /*smallest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue}, + /*file_size=*/100); + Add(/*level=*/1, /*file_number=*/3, /*smallest=*/ + {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue}, + /*file_size=*/100); + + UpdateVersionStorageInfo(); + + ASSERT_EQ( + "1,2", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue})); + ASSERT_EQ("3", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue})); +} + +class FindLevelFileTest : public testing::Test { + public: + LevelFilesBrief file_level_; + bool disjoint_sorted_files_; + Arena arena_; + + FindLevelFileTest() : disjoint_sorted_files_(true) {} + + ~FindLevelFileTest() override {} + + void LevelFileInit(size_t num = 0) { + char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange)); + file_level_.files = new (mem) FdWithKeyRange[num]; + file_level_.num_files = 0; + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue); + InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue); + + Slice smallest_slice = smallest_key.Encode(); + Slice largest_slice = largest_key.Encode(); + + char* mem = + arena_.AllocateAligned(smallest_slice.size() + largest_slice.size()); + memcpy(mem, smallest_slice.data(), smallest_slice.size()); + memcpy(mem + smallest_slice.size(), largest_slice.data(), + largest_slice.size()); + + // add to file_level_ + size_t num = file_level_.num_files; + auto& file = file_level_.files[num]; + file.fd = FileDescriptor(num + 1, 0, 0); + file.smallest_key = Slice(mem, smallest_slice.size()); + file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size()); + file_level_.num_files++; + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, file_level_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKeyComparator cmp(BytewiseComparator()); + Slice s(smallest != nullptr ? smallest : ""); + Slice l(largest != nullptr ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_, + (smallest != nullptr ? &s : nullptr), + (largest != nullptr ? 
&l : nullptr)); + } +}; + +TEST_F(FindLevelFileTest, LevelEmpty) { + LevelFileInit(0); + + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(!Overlaps("a", "z")); + ASSERT_TRUE(!Overlaps(nullptr, "z")); + ASSERT_TRUE(!Overlaps("a", nullptr)); + ASSERT_TRUE(!Overlaps(nullptr, nullptr)); +} + +TEST_F(FindLevelFileTest, LevelSingle) { + LevelFileInit(1); + + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(!Overlaps("a", "b")); + ASSERT_TRUE(!Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(!Overlaps(nullptr, "j")); + ASSERT_TRUE(!Overlaps("r", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "p")); + ASSERT_TRUE(Overlaps(nullptr, "p1")); + ASSERT_TRUE(Overlaps("q", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); +} + +TEST_F(FindLevelFileTest, LevelMultiple) { + LevelFileInit(4); + + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(!Overlaps("100", "149")); + ASSERT_TRUE(!Overlaps("251", "299")); + ASSERT_TRUE(!Overlaps("451", "500")); + ASSERT_TRUE(!Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) { + LevelFileInit(4); + + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(!Overlaps(nullptr, "149")); + ASSERT_TRUE(!Overlaps("451", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "150")); + ASSERT_TRUE(Overlaps(nullptr, "199")); + ASSERT_TRUE(Overlaps(nullptr, "200")); + ASSERT_TRUE(Overlaps(nullptr, "201")); + ASSERT_TRUE(Overlaps(nullptr, "400")); + ASSERT_TRUE(Overlaps(nullptr, "800")); + ASSERT_TRUE(Overlaps("100", nullptr)); + ASSERT_TRUE(Overlaps("200", nullptr)); + ASSERT_TRUE(Overlaps("449", nullptr)); + ASSERT_TRUE(Overlaps("450", nullptr)); +} + +TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) { + LevelFileInit(1); + + Add("200", "200", 5000, 3000); + ASSERT_TRUE(!Overlaps("199", "199")); + ASSERT_TRUE(!Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + +TEST_F(FindLevelFileTest, LevelOverlappingFiles) { + LevelFileInit(2); + + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(!Overlaps("100", "149")); + 
ASSERT_TRUE(!Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + +class VersionSetTestBase { + public: + const static std::string kColumnFamilyName1; + const static std::string kColumnFamilyName2; + const static std::string kColumnFamilyName3; + int num_initial_edits_; + + explicit VersionSetTestBase(const std::string& name) + : env_(nullptr), + dbname_(test::PerThreadDBPath(name)), + options_(), + db_options_(options_), + cf_options_(options_), + immutable_options_(db_options_, cf_options_), + mutable_cf_options_(cf_options_), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + shutting_down_(false), + mock_table_factory_(std::make_shared()) { + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + if (env_ == Env::Default() && getenv("MEM_ENV")) { + env_guard_.reset(NewMemEnv(Env::Default())); + env_ = env_guard_.get(); + } + EXPECT_NE(nullptr, env_); + + fs_ = env_->GetFileSystem(); + EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); + + options_.env = env_; + db_options_.env = env_; + db_options_.fs = fs_; + immutable_options_.env = env_; + immutable_options_.fs = fs_; + immutable_options_.clock = env_->GetSystemClock().get(); + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + reactive_versions_ = std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, nullptr); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + } + + virtual ~VersionSetTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "DB is still at %s\n", dbname_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(dbname_, options)); + } + } + + protected: + virtual void PrepareManifest( + std::vector* column_families, + SequenceNumber* last_seqno, std::unique_ptr* log_writer) { + assert(column_families != nullptr); + assert(last_seqno != nullptr); + assert(log_writer != nullptr); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + const int kInitialNumOfCfs = static_cast(cf_names.size()); + autovector new_cfs; + uint64_t last_seq = 1; + uint32_t cf_id = 1; + for (int i = 1; i != kInitialNumOfCfs; ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + new_cf.SetLogNumber(0); + new_cf.SetNextFile(2); + new_cf.SetLastSequence(last_seq++); + new_cfs.emplace_back(new_cf); + } + *last_seqno = last_seq; + num_initial_edits_ = static_cast(new_cfs.size() + 1); + std::unique_ptr file_writer; + const 
+    const auto& fs = env_->GetFileSystem();
+    Status s = WritableFileWriter::Create(
+        fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+        nullptr);
+    ASSERT_OK(s);
+    {
+      log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = (*log_writer)->AddRecord(record);
+      for (const auto& e : new_cfs) {
+        record.clear();
+        e.EncodeTo(&record);
+        s = (*log_writer)->AddRecord(record);
+        ASSERT_OK(s);
+      }
+    }
+    ASSERT_OK(s);
+
+    cf_options_.table_factory = mock_table_factory_;
+    for (const auto& cf_name : cf_names) {
+      column_families->emplace_back(cf_name, cf_options_);
+    }
+  }
+
+  // Create DB with 3 column families.
+  void NewDB() {
+    SequenceNumber last_seqno;
+    std::unique_ptr<log::Writer> log_writer;
+    SetIdentityFile(env_, dbname_);
+    PrepareManifest(&column_families_, &last_seqno, &log_writer);
+    log_writer.reset();
+    // Make "CURRENT" file point to the new manifest file.
+    Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+    ASSERT_OK(s);
+
+    EXPECT_OK(versions_->Recover(column_families_, false));
+    EXPECT_EQ(column_families_.size(),
+              versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  }
+
+  void ReopenDB() {
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    EXPECT_OK(versions_->Recover(column_families_, false));
+  }
+
+  void VerifyManifest(std::string* manifest_path) const {
+    assert(manifest_path != nullptr);
+    uint64_t manifest_file_number = 0;
+    Status s = versions_->GetCurrentManifestPath(
+        dbname_, fs_.get(), manifest_path, &manifest_file_number);
+    ASSERT_OK(s);
+    ASSERT_EQ(1, manifest_file_number);
+  }
+
+  Status LogAndApplyToDefaultCF(VersionEdit& edit) {
+    mutex_.Lock();
+    Status s =
+        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                               mutable_cf_options_, &edit, &mutex_, nullptr);
+    mutex_.Unlock();
+    return s;
+  }
+
+  Status LogAndApplyToDefaultCF(
+      const autovector<std::unique_ptr<VersionEdit>>& edits) {
+    autovector<VersionEdit*> vedits;
+    for (auto& e : edits) {
+      vedits.push_back(e.get());
+    }
+    mutex_.Lock();
+    Status s =
+        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                               mutable_cf_options_, vedits, &mutex_, nullptr);
+    mutex_.Unlock();
+    return s;
+  }
+
+  void CreateNewManifest() {
+    constexpr FSDirectory* db_directory = nullptr;
+    constexpr bool new_descriptor_log = true;
+    mutex_.Lock();
+    VersionEdit dummy;
+    ASSERT_OK(versions_->LogAndApply(
+        versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
+        &dummy, &mutex_, db_directory, new_descriptor_log));
+    mutex_.Unlock();
+  }
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& cf_name,
+                                       const ColumnFamilyOptions& cf_options) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(cf_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    new_cf.SetColumnFamily(new_id);
+    new_cf.SetLogNumber(0);
+    new_cf.SetComparatorName(cf_options.comparator->Name());
+    Status s;
+    mutex_.Lock();
+    s = versions_->LogAndApply(/*column_family_data=*/nullptr,
+                               MutableCFOptions(cf_options), &new_cf, &mutex_,
+                               /*db_directory=*/nullptr,
+                               /*new_descriptor_log=*/false, &cf_options);
+    mutex_.Unlock();
+    EXPECT_OK(s);
+    ColumnFamilyData* cfd =
+        versions_->GetColumnFamilySet()->GetColumnFamily(cf_name);
+    EXPECT_NE(nullptr, cfd);
+    return cfd;
+  }
+
+  Env* mem_env_;
+  Env* env_;
+  std::shared_ptr<Env> env_guard_;
+  std::shared_ptr<FileSystem> fs_;
+  const std::string dbname_;
+  EnvOptions env_options_;
+  Options options_;
+  ImmutableDBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  ImmutableOptions immutable_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  std::shared_ptr<VersionSet> versions_;
+  std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+  VersionSetTest() : VersionSetTestBase("version_set_test") {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+  NewDB();
+  const int kGroupSize = 5;
+  autovector<VersionEdit> edits;
+  for (int i = 0; i != kGroupSize; ++i) {
+    edits.emplace_back(VersionEdit());
+  }
+  autovector<ColumnFamilyData*> cfds;
+  autovector<const MutableCFOptions*> all_mutable_cf_options;
+  autovector<autovector<VersionEdit*>> edit_lists;
+  for (int i = 0; i != kGroupSize; ++i) {
+    cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+    all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(&edits[i]);
+    edit_lists.emplace_back(edit_list);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+        uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+        EXPECT_EQ(0u, *cf_id);
+        ++count;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  mutex_.Lock();
+  Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists,
+                                    &mutex_, nullptr);
+  mutex_.Unlock();
+  EXPECT_OK(s);
+  EXPECT_EQ(kGroupSize - 1, count);
+}
+
+TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) {
+  // Initialize the database and add a couple of blob files, one with some
+  // garbage in it, and one without any garbage.
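
The test above and several below drive internal behavior through RocksDB's SyncPoint hooks. The recurring shape, as a minimal sketch (the marker string here is illustrative only, not a marker from this patch):

#include "test_util/sync_point.h"  // RocksDB-internal header

// Minimal sketch of the SyncPoint pattern used throughout these tests:
// register a callback keyed by a marker compiled into the code under test,
// enable processing, run the scenario, then disable and clear.
void CountMarkerHits(size_t* hits) {
  using ROCKSDB_NAMESPACE::SyncPoint;
  SyncPoint::GetInstance()->SetCallBack(
      "Some::Marker::Name",  // hypothetical marker, for illustration
      [hits](void* /* arg */) { ++*hits; });
  SyncPoint::GetInstance()->EnableProcessing();
  // ... exercise the code under test ...
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}
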
+ NewDB(); + + assert(versions_); + assert(versions_->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 456; + constexpr uint64_t total_blob_bytes = 77777777; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c" + "\x52\x5c\xbd"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr uint64_t garbage_blob_count = 89; + constexpr uint64_t garbage_blob_bytes = 1000000; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + { + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + // Force the creation of a new manifest file and make sure metadata for + // the blob files is re-persisted. + size_t addition_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileAddition::EncodeTo::CustomFields", + [&](void* /* arg */) { ++addition_encoded; }); + + size_t garbage_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", + [&](void* /* arg */) { ++garbage_encoded; }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + ASSERT_EQ(addition_encoded, 2); + ASSERT_EQ(garbage_encoded, 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(VersionSetTest, AddLiveBlobFiles) { + // Initialize the database and add a blob file. 
+  NewDB();
+
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const first_version = cfd->current();
+  assert(first_version);
+
+  VersionStorageInfo* const first_storage_info = first_version->storage_info();
+  assert(first_storage_info);
+
+  constexpr uint64_t first_blob_file_number = 234;
+  constexpr uint64_t first_total_blob_count = 555;
+  constexpr uint64_t first_total_blob_bytes = 66666;
+  constexpr char first_checksum_method[] = "CRC32";
+  constexpr char first_checksum_value[] = "\x3d\x87\xff\x57";
+
+  auto first_shared_meta = SharedBlobFileMetaData::Create(
+      first_blob_file_number, first_total_blob_count, first_total_blob_bytes,
+      first_checksum_method, first_checksum_value);
+
+  constexpr uint64_t garbage_blob_count = 0;
+  constexpr uint64_t garbage_blob_bytes = 0;
+
+  auto first_meta = BlobFileMetaData::Create(
+      std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(),
+      garbage_blob_count, garbage_blob_bytes);
+
+  first_storage_info->AddBlobFile(first_meta);
+
+  // Reference the version so it stays alive even after the following version
+  // edit.
+  first_version->Ref();
+
+  // Get live files directly from version.
+  std::vector<uint64_t> version_table_files;
+  std::vector<uint64_t> version_blob_files;
+
+  first_version->AddLiveFiles(&version_table_files, &version_blob_files);
+
+  ASSERT_EQ(version_blob_files.size(), 1);
+  ASSERT_EQ(version_blob_files[0], first_blob_file_number);
+
+  // Create a new version containing an additional blob file.
+  versions_->TEST_CreateAndAppendVersion(cfd);
+
+  Version* const second_version = cfd->current();
+  assert(second_version);
+  assert(second_version != first_version);
+
+  VersionStorageInfo* const second_storage_info =
+      second_version->storage_info();
+  assert(second_storage_info);
+
+  constexpr uint64_t second_blob_file_number = 456;
+  constexpr uint64_t second_total_blob_count = 100;
+  constexpr uint64_t second_total_blob_bytes = 2000000;
+  constexpr char second_checksum_method[] = "CRC32B";
+  constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+  auto second_shared_meta = SharedBlobFileMetaData::Create(
+      second_blob_file_number, second_total_blob_count,
+      second_total_blob_bytes, second_checksum_method, second_checksum_value);
+
+  auto second_meta = BlobFileMetaData::Create(
+      std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(),
+      garbage_blob_count, garbage_blob_bytes);
+
+  second_storage_info->AddBlobFile(std::move(first_meta));
+  second_storage_info->AddBlobFile(std::move(second_meta));
+
+  // Get all live files from version set. Note that the result contains
+  // duplicates.
+  std::vector<uint64_t> all_table_files;
+  std::vector<uint64_t> all_blob_files;
+
+  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
+
+  ASSERT_EQ(all_blob_files.size(), 3);
+  ASSERT_EQ(all_blob_files[0], first_blob_file_number);
+  ASSERT_EQ(all_blob_files[1], first_blob_file_number);
+  ASSERT_EQ(all_blob_files[2], second_blob_file_number);
+
+  // Clean up previous version.
+  first_version->Unref();
+}
+
+TEST_F(VersionSetTest, ObsoleteBlobFile) {
+  // Initialize the database and add a blob file that is entirely garbage
+  // and thus can immediately be marked obsolete.
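
The test below revolves around the min_pending_output cutoff: an obsolete file is only reported once its number is strictly below the smallest file number that may still be in flight, and each file is reported at most once. A small sketch of that hand-off discipline (hypothetical helper, not the VersionSet implementation):

#include <cstdint>
#include <vector>

// Sketch of the min_pending_output gate: only files whose number is strictly
// below the lowest possibly-in-flight file number are safe to report (and
// delete); reported entries are handed off, so a repeat call skips them.
std::vector<uint64_t> TakeObsoleteBelow(std::vector<uint64_t>& pending,
                                        uint64_t min_pending_output) {
  std::vector<uint64_t> out;
  for (auto it = pending.begin(); it != pending.end();) {
    if (*it < min_pending_output) {
      out.push_back(*it);
      it = pending.erase(it);  // hand off: not returned a second time
    } else {
      ++it;
    }
  }
  return out;
}
// With pending = {234}: TakeObsoleteBelow(pending, 234) returns {}, while
// TakeObsoleteBelow(pending, 235) returns {234}; a repeat call returns {}.
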
+  NewDB();
+
+  VersionEdit edit;
+
+  constexpr uint64_t blob_file_number = 234;
+  constexpr uint64_t total_blob_count = 555;
+  constexpr uint64_t total_blob_bytes = 66666;
+  constexpr char checksum_method[] = "CRC32";
+  constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  edit.AddBlobFileGarbage(blob_file_number, total_blob_count,
+                          total_blob_bytes);
+
+  mutex_.Lock();
+  Status s =
+      versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                             mutable_cf_options_, &edit, &mutex_, nullptr);
+  mutex_.Unlock();
+
+  ASSERT_OK(s);
+
+  // Make sure blob files from the pending number range are not returned
+  // as obsolete.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_TRUE(blob_files.empty());
+  }
+
+  // Make sure the blob file is returned as obsolete if it's not in the pending
+  // range.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_EQ(blob_files.size(), 1);
+    ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number);
+  }
+
+  // Make sure it's not returned a second time.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_TRUE(blob_files.empty());
+  }
+}
+
+TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) {
+  NewDB();
+
+  constexpr uint64_t kNumWals = 5;
+
+  autovector<std::unique_ptr<VersionEdit>> edits;
+  // Add some WALs.
+  for (uint64_t i = 1; i <= kNumWals; i++) {
+    edits.emplace_back(new VersionEdit);
+    // WAL's size equals its log number.
+    edits.back()->AddWal(i, WalMetadata(i));
+  }
+  // Delete the first half of the WALs.
+  edits.emplace_back(new VersionEdit);
+  edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+
+  autovector<Version*> versions;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:NewVersion",
+      [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Since the edits are all WAL edits, no version should be created.
+  ASSERT_EQ(versions.size(), 1);
+  ASSERT_EQ(versions[0], nullptr);
+}
+
+// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit.
+TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) {
+  NewDB();
+
+  const std::string kDBId = "db_db";
+  constexpr uint64_t kNumWals = 5;
+
+  autovector<std::unique_ptr<VersionEdit>> edits;
+  // Add some WALs.
+  for (uint64_t i = 1; i <= kNumWals; i++) {
+    edits.emplace_back(new VersionEdit);
+    // WAL's size equals its log number.
+    edits.back()->AddWal(i, WalMetadata(i));
+  }
+  // Delete the first half of the WALs.
+ edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_NE(versions[0], nullptr); +} + +TEST_F(VersionSetTest, WalAddition) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced for several times before closing. + { + for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) { + uint64_t size = kSizeInBytes - size_delta; + WalMetadata wal(size); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size); + } + } + + // The WAL is closed. + { + WalMetadata wal(kSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalCloseWithoutSync) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced before closing. 
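
As background for these WAL tests: the WAL set recovered from the MANIFEST maps log numbers to metadata, where a WAL added without metadata has no synced size yet, and a recorded synced size is never reduced (AddWalWithSmallerSize below checks exactly that). A compact sketch of that discipline, using std::optional in place of the patch's WalMetadata:

#include <cstdint>
#include <map>
#include <optional>

// Hypothetical mini WAL set: log number -> synced size (nullopt = not yet
// synced). Mirrors the behavior the surrounding tests assert: creation
// without metadata leaves the synced size unset; later updates only grow it.
class MiniWalSet {
 public:
  void AddWal(uint64_t number, std::optional<uint64_t> synced_size = {}) {
    auto it = wals_.find(number);
    if (it == wals_.end()) {
      wals_[number] = synced_size;
    } else if (synced_size && (!it->second || *synced_size > *it->second)) {
      it->second = synced_size;  // never shrink a recorded synced size
    }
  }
  void DeleteWalsBefore(uint64_t cutoff) {
    // Drop every WAL with a strictly smaller log number.
    wals_.erase(wals_.begin(), wals_.lower_bound(cutoff));
  }

 private:
  std::map<uint64_t, std::optional<uint64_t>> wals_;
};
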
+ { + WalMetadata wal(kSyncedSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } + + // A new WAL with larger log number is created, + // implicitly marking the current WAL closed. + { + VersionEdit edit; + edit.AddWal(kLogNumber + 1); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize()); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalDeletion) { + NewDB(); + + constexpr WalNumber kClosedLogNumber = 10; + constexpr WalNumber kNonClosedLogNumber = 20; + constexpr uint64_t kSizeInBytes = 111; + + // Add a non-closed and a closed WAL. + { + VersionEdit edit; + edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes)); + edit.AddWal(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Delete the closed WAL. + { + VersionEdit edit; + edit.DeleteWalsBefore(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Recover a new VersionSet, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Force the creation of a new MANIFEST file, + // only the non-closed WAL should be written to the new MANIFEST. 
+  {
+    std::vector<WalAddition> wal_additions;
+    SyncPoint::GetInstance()->SetCallBack(
+        "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) {
+          VersionEdit* edit = reinterpret_cast<VersionEdit*>(arg);
+          ASSERT_TRUE(edit->IsWalAddition());
+          for (auto& addition : edit->GetWalAdditions()) {
+            wal_additions.push_back(addition);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    CreateNewManifest();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    ASSERT_EQ(wal_additions.size(), 1);
+    ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber);
+    ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize());
+  }
+
+  // Recover from the new MANIFEST, only the non-closed WAL should show up.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+  }
+}
+
+TEST_F(VersionSetTest, WalCreateTwice) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+
+  VersionEdit edit;
+  edit.AddWal(kLogNumber);
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+  Status s = LogAndApplyToDefaultCF(edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+              std::string::npos)
+      << s.ToString();
+}
+
+TEST_F(VersionSetTest, WalCreateAfterClose) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add a closed WAL.
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  {
+    // Create the same WAL again.
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+
+    Status s = LogAndApplyToDefaultCF(edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+                std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionSetTest, AddWalWithSmallerSize) {
+  NewDB();
+  assert(versions_);
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add a closed WAL.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+  // Copy for future comparison.
+  const std::map<WalNumber, WalMetadata> wals1 =
+      versions_->GetWalSet().GetWals();
+
+  {
+    // Add the same WAL with smaller synced size.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes / 2);
+    edit.AddWal(kLogNumber, wal);
+
+    Status s = LogAndApplyToDefaultCF(edit);
+    ASSERT_OK(s);
+  }
+  const std::map<WalNumber, WalMetadata> wals2 =
+      versions_->GetWalSet().GetWals();
+  ASSERT_EQ(wals1, wals2);
+}
+
+TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber0 = 10;
+  constexpr WalNumber kLogNumber1 = 20;
+  constexpr WalNumber kNonExistingNumber = 15;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add closed WALs.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber0, wal);
+    edit.AddWal(kLogNumber1, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  {
+    // Delete WALs before a non-existing WAL.
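
Note that the cutoff passed to DeleteWalsBefore does not have to name an existing WAL: every entry with a strictly smaller log number is dropped. Here 10 < 15 <= 20, so only WAL 10 disappears, as the recovery block below confirms.
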
+ VersionEdit edit; + edit.DeleteWalsBefore(kNonExistingNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber1) != wals.end()); + } +} + +TEST_F(VersionSetTest, DeleteAllWals) { + NewDB(); + + constexpr WalNumber kMaxLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kMaxLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + VersionEdit edit; + edit.DeleteWalsBefore(kMaxLogNumber + 10); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, all WALs are deleted. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 0); + } +} + +TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { + NewDB(); + + constexpr int kAtomicGroupSize = 7; + constexpr uint64_t kNumWals = 5; + const std::string kDBId = "db_db"; + + int remaining = kAtomicGroupSize; + autovector> edits; + // Add 5 WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + edits.back()->MarkAtomicGroup(--remaining); + } + // One edit with the min log number set. + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + edits.back()->MarkAtomicGroup(--remaining); + // Delete the first added 4 WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals); + edits.back()->MarkAtomicGroup(--remaining); + ASSERT_EQ(remaining, 0); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + // Recover a new VersionSet, the min log number and the last WAL should be + // kept. 
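
The MarkAtomicGroup(--remaining) calls above encode the group as a countdown: each edit records how many edits of the group follow it, so the edit carrying 0 terminates the group, and recovery applies all of it or none. A sketch of tagging a batch this way (hypothetical helper around the patch's VersionEdit API):

#include <vector>

#include "db/version_edit.h"  // RocksDB-internal header

// Hypothetical helper: tag `edits` as one atomic group. Each edit stores the
// number of edits remaining after it; the last one stores 0, which is how
// recovery detects the end of the group.
void MarkAsAtomicGroup(std::vector<ROCKSDB_NAMESPACE::VersionEdit*>& edits) {
  int remaining = static_cast<int>(edits.size());
  for (ROCKSDB_NAMESPACE::VersionEdit* e : edits) {
    e->MarkAtomicGroup(--remaining);
  }
}
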
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    std::string db_id;
+    ASSERT_OK(
+        new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
+
+    ASSERT_EQ(db_id, kDBId);
+
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNumWals) != wals.end());
+    ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize());
+    ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals);
+  }
+}
+
+class VersionSetWithTimestampTest : public VersionSetTest {
+ public:
+  static const std::string kNewCfName;
+
+  explicit VersionSetWithTimestampTest() : VersionSetTest() {}
+
+  void SetUp() override {
+    NewDB();
+    Options options;
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+    cfd_ = CreateColumnFamily(kNewCfName, options);
+    EXPECT_NE(nullptr, cfd_);
+    EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions());
+    column_families_.emplace_back(kNewCfName, options);
+  }
+
+  void TearDown() override {
+    for (auto* e : edits_) {
+      delete e;
+    }
+    edits_.clear();
+  }
+
+  void GenVersionEditsToSetFullHistoryTsLow(
+      const std::vector<uint64_t>& ts_lbs) {
+    for (const auto ts_lb : ts_lbs) {
+      VersionEdit* edit = new VersionEdit;
+      edit->SetColumnFamily(cfd_->GetID());
+      std::string ts_str = test::EncodeInt(ts_lb);
+      edit->SetFullHistoryTsLow(ts_str);
+      edits_.emplace_back(edit);
+    }
+  }
+
+  void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
+    std::unique_ptr<VersionSet> vset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
+                            /*db_id=*/nullptr));
+    for (auto* cfd : *(vset->GetColumnFamilySet())) {
+      ASSERT_NE(nullptr, cfd);
+      if (cfd->GetName() == kNewCfName) {
+        ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow());
+      } else {
+        ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty());
+      }
+    }
+  }
+
+  void DoTest(const std::vector<uint64_t>& ts_lbs) {
+    if (ts_lbs.empty()) {
+      return;
+    }
+
+    GenVersionEditsToSetFullHistoryTsLow(ts_lbs);
+
+    Status s;
+    mutex_.Lock();
+    s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()),
+                               edits_, &mutex_, nullptr);
+    mutex_.Unlock();
+    ASSERT_OK(s);
+    VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));
+  }
+
+ protected:
+  ColumnFamilyData* cfd_{nullptr};
+  // edits_ must contain and own pointers to heap-alloc VersionEdit objects.
+  autovector<VersionEdit*> edits_;
+};
+
+const std::string VersionSetWithTimestampTest::kNewCfName("new_cf");
+
+TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) {
+  constexpr uint64_t kTsLow = 100;
+  DoTest({kTsLow});
+}
+
+// Simulate the application increasing full_history_ts_low.
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) {
+  const std::vector<uint64_t> ts_lbs = {100, 101, 102, 103};
+  DoTest(ts_lbs);
+}
+
+// Simulate the application trying to decrease full_history_ts_low
+// unsuccessfully. If the application calls the public API sequentially to
+// decrease the lower bound ts, RocksDB will return an InvalidArgument
+// status before involving VersionSet. Only when multiple threads try
+// to decrease the lower bound concurrently will this case ever happen. Even
+// so, the lower bound cannot be decreased. The application will be notified
+// The application will be notified via the return value of the API.
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) {
+  const std::vector<uint64_t> ts_lbs = {103, 102, 101, 100};
+  DoTest(ts_lbs);
+}
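+
+// Editor's note: an illustrative sketch, not part of the upstream file. The
+// net effect verified by the two tests above is that full_history_ts_low is
+// a monotonic maximum: applying several SetFullHistoryTsLow edits in a single
+// LogAndApply() call leaves the column family at the largest bound,
+// regardless of the order in which the edits were generated. Assuming u64
+// timestamps encoded with test::EncodeInt():
+//
+//   // {100, 101, 102, 103} and {103, 102, 101, 100} both recover to 103,
+//   // i.e. DoTest() always ends with
+//   VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));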
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+                                  public testing::Test {
+ public:
+  VersionSetAtomicGroupTest()
+      : VersionSetTestBase("version_set_atomic_group_test") {}
+
+  void SetUp() override {
+    PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+    SetupTestSyncPoints();
+  }
+
+  void SetupValidAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupCorruptedAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != ((size_t)atomic_group_size / 2)) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupIncorrectAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != 1) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      } else {
+        edits_[i].MarkAtomicGroup(remaining--);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupTestSyncPoints() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.front().DebugString(),
+                    e->DebugString());  // compare based on value
+          first_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.back().DebugString(),
+                    e->DebugString());  // compare based on value
+          EXPECT_TRUE(first_in_atomic_group_);
+          last_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) {
+          num_recovered_edits_ = *reinterpret_cast<size_t*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+        [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+        [&](void* arg) {
+          corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+        [&](void* arg) {
+          edit_with_incorrect_group_size_ =
+              *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  void AddNewEditsToLog(int num_edits) {
+    for (int i = 0; i < num_edits; i++) {
+      std::string record;
+      edits_[i].EncodeTo(&record);
+      ASSERT_OK(log_writer_->AddRecord(record));
+    }
+  }
+
+  void TearDown() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    log_writer_.reset();
+  }
+
+ protected:
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  SequenceNumber last_seqno_;
+  std::vector<VersionEdit> edits_;
+  bool first_in_atomic_group_ = false;
+  bool last_in_atomic_group_ = false;
+  int num_edits_in_atomic_group_ = 0;
+  size_t num_recovered_edits_ = 0;
+  VersionEdit corrupted_edit_;
+  VersionEdit edit_with_incorrect_group_size_;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_OK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  // The recover should clean up the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+  AddNewEditsToLog(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  // The recover should clean up the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  EXPECT_OK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  // Reactive version set should store the edits in the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+              kNumberOfPersistedVersionEdits);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+  // Write the last record. The reactive version set should now apply all
+  // edits.
+  std::string last_record;
+  edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+  EXPECT_OK(log_writer_->AddRecord(last_record));
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  // Reactive version set should be empty now.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  // No edits in an atomic group.
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+  // Write a few edits in an atomic group.
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  // Reactive version set should store the edits in the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+              kNumberOfPersistedVersionEdits);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_NOK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                         &manifest_reporter,
+                                         &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  // Write the corrupted edits.
+  AddNewEditsToLog(kAtomicGroupSize);
+  mu.Lock();
+  EXPECT_NOK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_NOK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                         &manifest_reporter,
+                                         &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  AddNewEditsToLog(kAtomicGroupSize);
+  mu.Lock();
+  EXPECT_NOK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
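+
+// Editor's note: an illustrative sketch, not part of the upstream file. The
+// Setup* helpers above encode an atomic group the way the MANIFEST does:
+// each of the N edits in a group is tagged with the number of edits that
+// remain after it, counting down to 0. Assuming a group of three edits:
+//
+//   int remaining = 3;
+//   for (VersionEdit& e : edits) {
+//     e.MarkAtomicGroup(--remaining);  // tags 2, 1, 0
+//   }
+//
+// A recovering reader buffers the group until it sees the edit tagged 0 and
+// only then applies all of it; the corrupted/incorrect-size tests above work
+// by breaking exactly this countdown.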
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+                                public testing::TestWithParam<std::string> {
+ public:
+  VersionSetTestDropOneCF()
+      : VersionSetTestBase("version_set_test_drop_one_cf") {}
+};
+
+// This test simulates the following execution sequence
+// Time  thread1                  bg_flush_thr
+//  |                             Prepare version edits (e1,e2,e3) for atomic
+//  |                             flush cf1, cf2, cf3
+//  |    Enqueue e to drop cfi
+//  |    to manifest_writers_
+//  |                             Enqueue (e1,e2,e3) to manifest_writers_
+//  |
+//  |    Apply e,
+//  |    cfi.IsDropped() is true
+//  |                             Apply (e1,e2,e3),
+//  |                             since cfi.IsDropped() == true, we need to
+//  |                             drop ei and write the rest to MANIFEST.
+//  V
+//
+// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+// last column family in an atomic group.
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  SequenceNumber last_seqno;
+  std::unique_ptr<log::Writer> log_writer;
+  PrepareManifest(&column_families, &last_seqno, &log_writer);
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+
+  EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+  EXPECT_EQ(column_families.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+  const int kAtomicGroupSize = 3;
+  const std::vector<std::string> non_default_cf_names = {
+      kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+  // Drop one column family
+  VersionEdit drop_cf_edit;
+  drop_cf_edit.DropColumnFamily();
+  const std::string cf_to_drop_name(GetParam());
+  auto cfd_to_drop =
+      versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+  ASSERT_NE(nullptr, cfd_to_drop);
+  // Increase its refcount because cfd_to_drop is used later, and we need to
+  // prevent it from being deleted.
+  cfd_to_drop->Ref();
+  drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+  mutex_.Lock();
+  s = versions_->LogAndApply(cfd_to_drop,
+                             *cfd_to_drop->GetLatestMutableCFOptions(),
+                             &drop_cf_edit, &mutex_, nullptr);
+  mutex_.Unlock();
+  ASSERT_OK(s);
+
+  std::vector<VersionEdit> edits(kAtomicGroupSize);
+  uint32_t remaining = kAtomicGroupSize;
+  size_t i = 0;
+  autovector<ColumnFamilyData*> cfds;
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  autovector<autovector<VersionEdit*>> edit_lists;
+  for (const auto& cf_name : non_default_cf_names) {
+    auto cfd = (cf_name != cf_to_drop_name)
+                   ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+                   : cfd_to_drop;
+    ASSERT_NE(nullptr, cfd);
+    cfds.push_back(cfd);
+    mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+    edits[i].SetColumnFamily(cfd->GetID());
+    edits[i].SetLogNumber(0);
+    edits[i].SetNextFile(2);
+    edits[i].MarkAtomicGroup(--remaining);
+    edits[i].SetLastSequence(last_seqno++);
+    autovector<VersionEdit*> tmp_edits;
+    tmp_edits.push_back(&edits[i]);
+    edit_lists.emplace_back(tmp_edits);
+    ++i;
+  }
+  int called = 0;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+        std::vector<VersionEdit*>* tmp_edits =
+            reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+        EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+        for (const auto e : *tmp_edits) {
+          bool found = false;
+          for (const auto& e2 : edits) {
+            if (&e2 == e) {
+              found = true;
+              break;
+            }
+          }
+          ASSERT_TRUE(found);
+        }
+        ++called;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  mutex_.Lock();
+  s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
+                             &mutex_, nullptr);
+  mutex_.Unlock();
+  ASSERT_OK(s);
+  ASSERT_EQ(1, called);
+  cfd_to_drop->UnrefAndTryDelete();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AtomicGroup, VersionSetTestDropOneCF,
+    testing::Values(VersionSetTestBase::kColumnFamilyName1,
+                    VersionSetTestBase::kColumnFamilyName2,
+                    VersionSetTestBase::kColumnFamilyName3));
+
+class EmptyDefaultCfNewManifest : public VersionSetTestBase,
+                                  public testing::Test {
+ public:
+  EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {}
+  // Emulate DBImpl::NewDB()
+  void PrepareManifest(
+      std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+      SequenceNumber* /*last_seqno*/,
+      std::unique_ptr<log::Writer>* log_writer) override {
+    assert(log_writer != nullptr);
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    const std::string manifest_path = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+        &file_writer, nullptr);
+    ASSERT_OK(s);
+    log_writer->reset(new log::Writer(std::move(file_writer), 0, true));
+    std::string record;
+    ASSERT_TRUE(new_db.EncodeTo(&record));
+    s = (*log_writer)->AddRecord(record);
+    ASSERT_OK(s);
+    // Create new column family
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+    new_cf.SetColumnFamily(1);
+    new_cf.SetLastSequence(2);
+    new_cf.SetNextFile(2);
+    record.clear();
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = (*log_writer)->AddRecord(record);
+    ASSERT_OK(s);
+  }
+
+ protected:
+  bool write_dbid_to_manifest_ = false;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+// Create db, create column family. Cf creation will switch to a new MANIFEST.
+// Then reopen db, trying to recover.
+TEST_F(EmptyDefaultCfNewManifest, Recover) {
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  log_writer_.reset();
+  Status s =
+      SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+  column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1,
+                               cf_options_);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(
+      manifest_path, column_families, false, &db_id, &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_FALSE(has_missing_table_file);
+}
+
+class VersionSetTestEmptyDb
+    : public VersionSetTestBase,
+      public testing::TestWithParam<
+          std::tuple<bool, bool, std::vector<std::string>>> {
+ public:
+  static const std::string kUnknownColumnFamilyName;
+  VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {}
+
+ protected:
+  void PrepareManifest(
+      std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+      SequenceNumber* /*last_seqno*/,
+      std::unique_ptr<log::Writer>* log_writer) override {
+    assert(nullptr != log_writer);
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBOptions tmp_db_options;
+      tmp_db_options.env = env_;
+      std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    const std::string manifest_path = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+        &file_writer, nullptr);
+    ASSERT_OK(s);
+    {
+      log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+  }
+
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  log_writer_.reset();
+  Status s =
+      SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string> cf_names = std::get<2>(GetParam());
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Only a subset of column families in the MANIFEST.
+  VersionEdit new_cf1;
+  new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+  new_cf1.SetColumnFamily(1);
+  Status s;
+  {
+    std::string record;
+    new_cf1.EncodeTo(&record);
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest2) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  {
+    VersionEdit tmp_edit;
+    tmp_edit.SetColumnFamily(4);
+    tmp_edit.SetLogNumber(0);
+    tmp_edit.SetNextFile(2);
+    tmp_edit.SetLastSequence(0);
+    std::string record;
+    ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  {
+    VersionEdit tmp_edit;
+    tmp_edit.SetLogNumber(0);
+    tmp_edit.SetNextFile(2);
+    tmp_edit.SetLastSequence(0);
+    std::string record;
+    ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else if (read_only) {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+  } else if (cf_names.size() == all_cf_names.size()) {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+  } else if (cf_names.size() < all_cf_names.size()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+    ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(
+        kUnknownColumnFamilyName);
+    ASSERT_EQ(nullptr, cfd);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BestEffortRecovery, VersionSetTestEmptyDb,
+    testing::Combine(
+        /*write_dbid_to_manifest=*/testing::Bool(),
+        /*read_only=*/testing::Bool(),
+        /*cf_names=*/
+        testing::Values(
+            std::vector<std::string>(),
+            std::vector<std::string>({kDefaultColumnFamilyName}),
+            std::vector<std::string>({VersionSetTestBase::kColumnFamilyName1,
+                                      VersionSetTestBase::kColumnFamilyName2,
+                                      VersionSetTestBase::kColumnFamilyName3}),
+            std::vector<std::string>({kDefaultColumnFamilyName,
+                                      VersionSetTestBase::kColumnFamilyName1}),
+            std::vector<std::string>({kDefaultColumnFamilyName,
+                                      VersionSetTestBase::kColumnFamilyName1,
+                                      VersionSetTestBase::kColumnFamilyName2,
+                                      VersionSetTestBase::kColumnFamilyName3}),
+            std::vector<std::string>(
+                {kDefaultColumnFamilyName,
+                 VersionSetTestBase::kColumnFamilyName1,
+                 VersionSetTestBase::kColumnFamilyName2,
+                 VersionSetTestBase::kColumnFamilyName3,
+                 VersionSetTestEmptyDb::kUnknownColumnFamilyName}))));
+
+class VersionSetTestMissingFiles : public VersionSetTestBase,
+                                   public testing::Test {
+ public:
+  VersionSetTestMissingFiles()
+      : VersionSetTestBase("version_set_test_missing_files"),
+        block_based_table_options_(),
+        table_factory_(std::make_shared<BlockBasedTableFactory>(
+            block_based_table_options_)),
+        internal_comparator_(
+            std::make_shared<InternalKeyComparator>(options_.comparator)) {}
+
+ protected:
+  void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+                       SequenceNumber* last_seqno,
+                       std::unique_ptr<log::Writer>* log_writer) override {
+    assert(column_families != nullptr);
+    assert(last_seqno != nullptr);
+    assert(log_writer != nullptr);
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+        nullptr);
+    ASSERT_OK(s);
+    log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBOptions tmp_db_options;
+      tmp_db_options.env = env_;
+      std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    {
+      std::string record;
+      ASSERT_TRUE(new_db.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+    const std::vector<std::string> cf_names = {
+        kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+        kColumnFamilyName3};
+    uint32_t cf_id = 1;  // default cf id is 0
+    cf_options_.table_factory = table_factory_;
+    for (const auto& cf_name : cf_names) {
+      column_families->emplace_back(cf_name, cf_options_);
+      if (cf_name == kDefaultColumnFamilyName) {
+        continue;
+      }
+      VersionEdit new_cf;
+      new_cf.AddColumnFamily(cf_name);
+      new_cf.SetColumnFamily(cf_id);
+      std::string record;
+      ASSERT_TRUE(new_cf.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+
+      VersionEdit cf_files;
+      cf_files.SetColumnFamily(cf_id);
+      cf_files.SetLogNumber(0);
+      record.clear();
+      ASSERT_TRUE(cf_files.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+      ++cf_id;
+    }
+    SequenceNumber seq = 2;
+    {
+      VersionEdit edit;
+      edit.SetNextFile(7);
+      edit.SetLastSequence(seq);
+      std::string record;
+      ASSERT_TRUE(edit.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+    *last_seqno = seq + 1;
+  }
+
+  struct SstInfo {
+    uint64_t file_number;
+    std::string column_family;
+    std::string key;  // the only key
+    int level = 0;
+    SstInfo(uint64_t file_num, const std::string& cf_name,
+            const std::string& _key)
+        : SstInfo(file_num, cf_name, _key, 0) {}
+    SstInfo(uint64_t file_num, const std::string& cf_name,
+            const std::string& _key, int lvl)
+        : file_number(file_num),
+          column_family(cf_name),
+          key(_key),
+          level(lvl) {}
+  };
+
+  // Create dummy sst files and return their metadata. Note that only the file
+  // name and size are used.
+  void CreateDummyTableFiles(const std::vector<SstInfo>& file_infos,
+                             std::vector<FileMetaData>* file_metas) {
+    assert(file_metas != nullptr);
+    for (const auto& info : file_infos) {
+      uint64_t file_num = info.file_number;
+      std::string fname = MakeTableFileName(dbname_, file_num);
+      std::unique_ptr<FSWritableFile> file;
+      Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr);
+      ASSERT_OK(s);
+      std::unique_ptr<WritableFileWriter> fwriter(new WritableFileWriter(
+          std::move(file), fname, FileOptions(),
+          env_->GetSystemClock().get()));
+      IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+      std::unique_ptr<TableBuilder> builder(table_factory_->NewTableBuilder(
+          TableBuilderOptions(
+              immutable_options_, mutable_cf_options_, *internal_comparator_,
+              &int_tbl_prop_collector_factories, kNoCompression,
+              CompressionOptions(),
+              TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+              info.column_family, info.level),
+          fwriter.get()));
+      InternalKey ikey(info.key, 0, ValueType::kTypeValue);
+      builder->Add(ikey.Encode(), "value");
+      ASSERT_OK(builder->Finish());
+      ASSERT_OK(fwriter->Flush());
+      uint64_t file_size = 0;
+      s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
+      ASSERT_OK(s);
+      ASSERT_NE(0, file_size);
+      file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
+                               ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
+                               0, kUnknownFileChecksum,
+                               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    }
+  }
+
+  // This method updates last_sequence_.
+  void WriteFileAdditionAndDeletionToManifest(
+      uint32_t cf,
+      const std::vector<std::pair<int, FileMetaData>>& added_files,
+      const std::vector<std::pair<int, uint64_t>>& deleted_files) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cf);
+    for (const auto& elem : added_files) {
+      int level = elem.first;
+      edit.AddFile(level, elem.second);
+    }
+    for (const auto& elem : deleted_files) {
+      int level = elem.first;
+      edit.DeleteFile(level, elem.second);
+    }
+    edit.SetLastSequence(last_seqno_);
+    ++last_seqno_;
+    assert(log_writer_.get() != nullptr);
+    std::string record;
+    ASSERT_TRUE(edit.EncodeTo(&record));
+    Status s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+
+  BlockBasedTableOptions block_based_table_options_;
+  std::shared_ptr<TableFactory> table_factory_;
+  std::shared_ptr<InternalKeyComparator> internal_comparator_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  SequenceNumber last_seqno_;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (uint64_t file_num = 10; file_num < 15; ++file_num) {
+    std::string smallest_ukey = "a";
+    std::string largest_ukey = "b";
+    InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+    InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+    FileMetaData meta = FileMetaData(
+        file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+        largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  std::vector<std::pair<int, uint64_t>> deleted_files;
+  deleted_files.emplace_back(0, 10);
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_TRUE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    ASSERT_TRUE(files.empty());
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (size_t i = 3; i != 5; ++i) {
+    added_files.emplace_back(0, file_metas[i]);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+
+  added_files.clear();
+  for (uint64_t file_num = 120; file_num < 130; ++file_num) {
+    std::string smallest_ukey = "a";
+    std::string largest_ukey = "b";
+    InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+    InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+    FileMetaData meta = FileMetaData(
+        file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+        largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_TRUE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    if (cfd->GetName() == kDefaultColumnFamilyName) {
+      ASSERT_EQ(2, files.size());
+      for (const auto* fmeta : files) {
+        if (fmeta->fd.GetNumber() != 107 && fmeta->fd.GetNumber() != 110) {
+          ASSERT_FALSE(true);
+        }
+      }
+    } else {
+      ASSERT_TRUE(files.empty());
+    }
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (const auto& meta : file_metas) {
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  std::vector<std::pair<int, uint64_t>> deleted_files;
+  deleted_files.emplace_back(/*level=*/0, 100);
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_FALSE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    if (cfd->GetName() == kDefaultColumnFamilyName) {
+      ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size());
+      bool has_deleted_file = false;
+      for (const auto* fmeta : files) {
+        if (fmeta->fd.GetNumber() == 100) {
+          has_deleted_file = true;
+          break;
+        }
+      }
+      ASSERT_FALSE(has_deleted_file);
+    } else {
+      ASSERT_TRUE(files.empty());
+    }
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
+  db_options_.allow_2pc = true;
+  NewDB();
+
+  SstInfo sst(100, kDefaultColumnFamilyName, "a");
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles({sst}, &file_metas);
+
+  constexpr WalNumber kMinWalNumberToKeep2PC = 10;
+  VersionEdit edit;
+  edit.AddFile(0, file_metas[0]);
+  edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
+  ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+
+  for (int i = 0; i < 3; i++) {
+    CreateNewManifest();
+    ReopenDB();
+    ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+  }
+}
+
+class ChargeFileMetadataTest : public DBTestBase {
+ public:
+  ChargeFileMetadataTest()
+      : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
+};
+
+class ChargeFileMetadataTestWithParam
+    : public ChargeFileMetadataTest,
+      public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
+ public:
+  ChargeFileMetadataTestWithParam() {}
+};
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+    ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
+    ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
+                      CacheEntryRoleOptions::Decision::kDisabled));
+
+TEST_P(ChargeFileMetadataTestWithParam, Basic) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
+  table_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
+      file_metadata_charge_only_cache = std::make_shared<
+          TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+          NewLRUCache(
+              4 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+              0 /* num_shard_bits */, true /* strict_capacity_limit */));
+  table_options.block_cache = file_metadata_charge_only_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Create 128 file metadata, each of which is roughly 1024 bytes.
+  // This results in 1 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+  // cache reservation for file metadata.
+  for (int i = 1; i <= 128; ++i) {
+    ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+    ASSERT_OK(Put("b", "vb"));
+    ASSERT_OK(Flush());
+  }
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              1 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+
+  // Create another 128 file metadata.
+  // This increases the file metadata cache reservation to 2 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
+  for (int i = 1; i <= 128; ++i) {
+    ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
+    ASSERT_OK(Put("b", "vvb"));
+    ASSERT_OK(Flush());
+  }
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              2 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+  // Compaction will create 1 new file metadata, obsolete and delete all 256
+  // file metadata above. This results in 1 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+  // cache reservation for file metadata.
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+        "ChargeFileMetadataTestWithParam::"
+        "PreVerifyingCacheReservationRelease"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  TEST_SYNC_POINT(
+      "ChargeFileMetadataTestWithParam::PreVerifyingCacheReservationRelease");
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              1 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Destroying the db will delete the remaining 1 new file metadata.
+  // This results in no cache reservation for file metadata.
+  Destroy(options);
+  EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+            0 * CacheReservationManagerImpl<
+                    CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+  // Reopen the db with a smaller cache in order to test failure in allocating
+  // file metadata due to memory limit based on cache capacity
+  file_metadata_charge_only_cache = std::make_shared<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+      NewLRUCache(1 * CacheReservationManagerImpl<
+                          CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+                  0 /* num_shard_bits */, true /* strict_capacity_limit */));
+  table_options.block_cache = file_metadata_charge_only_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+  ASSERT_OK(Put("b", "vb"));
+  Status s = Flush();
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_TRUE(s.IsMemoryLimit());
+    EXPECT_TRUE(s.ToString().find(
+                    kCacheEntryRoleToCamelString[static_cast<uint32_t>(
+                        CacheEntryRole::kFileMetadata)]) != std::string::npos);
+    EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+                std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+}
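+
+// Editor's note: an illustrative sketch, not part of the upstream file. The
+// test above depends on CacheReservationManagerImpl charging the block cache
+// in fixed-size "dummy entry" increments rather than per object: the tracked
+// bytes of live file metadata are rounded up to whole dummy entries,
+// conceptually
+//
+//   charge = ceil(tracked_bytes / GetDummyEntrySize()) * GetDummyEntrySize();
+//
+// which is why ~128 metadata objects of ~1KB map to one increment and ~256 to
+// two. With strict_capacity_limit set on a deliberately small cache, a failed
+// reservation surfaces as the Status::MemoryLimit checked at the end.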
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_util.h b/src/rocksdb/db/version_util.h
new file mode 100644
index 000000000..5ec6fda11
--- /dev/null
+++ b/src/rocksdb/db/version_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of opening a `DB` to perform certain manifest updates, this
+// uses the underlying `VersionSet` API to read and modify the MANIFEST. This
+// allows us to use the user's real options, while not having to worry about
+// the DB persisting new SST files via flush/compaction or attempting to read/
+// compact files which may fail, particularly for the file we intend to remove
+// (the user may want to remove an already deleted file from MANIFEST).
+class OfflineManifestWriter {
+ public:
+  OfflineManifestWriter(const DBOptions& options, const std::string& db_path)
+      : wc_(options.delayed_write_rate),
+        wb_(options.db_write_buffer_size),
+        immutable_db_options_(WithDbPath(options, db_path)),
+        tc_(NewLRUCache(1 << 20 /* capacity */,
+                        options.table_cache_numshardbits)),
+        versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_,
+                  &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                  /*db_id*/ "", /*db_session_id*/ "") {}
+
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families) {
+    return versions_.Recover(column_families, /*read_only*/ false,
+                             /*db_id*/ nullptr,
+                             /*no_error_if_files_missing*/ true);
+  }
+
+  Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit,
+                     FSDirectory* dir_contains_current_file) {
+    // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`.
+    InstrumentedMutex mutex;
+    mutex.Lock();
+    Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                     edit, &mutex, dir_contains_current_file,
+                                     false /* new_descriptor_log */);
+    mutex.Unlock();
+    return s;
+  }
+
+  VersionSet& Versions() { return versions_; }
+  const ImmutableDBOptions& IOptions() { return immutable_db_options_; }
+
+ private:
+  WriteController wc_;
+  WriteBufferManager wb_;
+  ImmutableDBOptions immutable_db_options_;
+  std::shared_ptr<Cache> tc_;
+  EnvOptions sopt_;
+  VersionSet versions_;
+
+  static ImmutableDBOptions WithDbPath(const DBOptions& options,
+                                       const std::string& db_path) {
+    ImmutableDBOptions rv(options);
+    if (rv.db_paths.empty()) {
+      // `VersionSet` expects options that have been through
+      // `SanitizeOptions()`, which would sanitize an empty `db_paths`.
+      rv.db_paths.emplace_back(db_path, 0 /* target_size */);
+    }
+    return rv;
+  }
+};
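+
+// Editor's note: an illustrative usage sketch, not part of the upstream
+// header. `options`, `db_path` and `column_families` are assumed to be
+// supplied by the caller, and the VersionEdit content is hypothetical:
+//
+//   OfflineManifestWriter w(options, db_path);
+//   Status s = w.Recover(column_families);
+//   if (s.ok()) {
+//     VersionEdit edit;
+//     edit.DeleteFile(/*level=*/0, /*file_number=*/123);
+//     ColumnFamilyData* cfd = w.Versions().GetColumnFamilySet()->GetDefault();
+//     s = w.LogAndApply(cfd, &edit, /*dir_contains_current_file=*/nullptr);
+//   }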
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit.cc b/src/rocksdb/db/wal_edit.cc
new file mode 100644
index 000000000..2525be610
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void WalAddition::EncodeTo(std::string* dst) const {
+  PutVarint64(dst, number_);
+
+  if (metadata_.HasSyncedSize()) {
+    PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+    PutVarint64(dst, metadata_.GetSyncedSizeInBytes());
+  }
+
+  PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+}
+
+Status WalAddition::DecodeFrom(Slice* src) {
+  constexpr char class_name[] = "WalAddition";
+
+  if (!GetVarint64(src, &number_)) {
+    return Status::Corruption(class_name, "Error decoding WAL log number");
+  }
+
+  while (true) {
+    uint32_t tag_value = 0;
+    if (!GetVarint32(src, &tag_value)) {
+      return Status::Corruption(class_name, "Error decoding tag");
+    }
+    WalAdditionTag tag = static_cast<WalAdditionTag>(tag_value);
+    switch (tag) {
+      case WalAdditionTag::kSyncedSize: {
+        uint64_t size = 0;
+        if (!GetVarint64(src, &size)) {
+          return Status::Corruption(class_name,
+                                    "Error decoding WAL file size");
+        }
+        metadata_.SetSyncedSizeInBytes(size);
+        break;
+      }
+      // TODO: process future tags such as checksum.
+      case WalAdditionTag::kTerminate:
+        return Status::OK();
+      default: {
+        std::stringstream ss;
+        ss << "Unknown tag " << tag_value;
+        return Status::Corruption(class_name, ss.str());
+      }
+    }
+  }
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) {
+  jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes"
+     << wal.GetMetadata().GetSyncedSizeInBytes();
+  return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal) {
+  os << "log_number: " << wal.GetLogNumber()
+     << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes();
+  return os;
+}
+
+std::string WalAddition::DebugString() const {
+  std::ostringstream oss;
+  oss << *this;
+  return oss.str();
+}
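+
+// Editor's note: an illustrative sketch, not part of the upstream file. The
+// record written by WalAddition::EncodeTo() above is a varint tag stream;
+// for WalAddition(5, WalMetadata(/*synced_size_bytes=*/100)) it is,
+// conceptually:
+//
+//   varint64(5)    // WAL log number
+//   varint32(2)    // WalAdditionTag::kSyncedSize
+//   varint64(100)  // synced size in bytes
+//   varint32(1)    // WalAdditionTag::kTerminate
+//
+// DecodeFrom() rejects unknown tags with Status::Corruption, so older code
+// fails loudly on records written with newer tags instead of silently
+// dropping them.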
+
+void WalDeletion::EncodeTo(std::string* dst) const {
+  PutVarint64(dst, number_);
+}
+
+Status WalDeletion::DecodeFrom(Slice* src) {
+  constexpr char class_name[] = "WalDeletion";
+
+  if (!GetVarint64(src, &number_)) {
+    return Status::Corruption(class_name, "Error decoding WAL log number");
+  }
+
+  return Status::OK();
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) {
+  jw << "LogNumber" << wal.GetLogNumber();
+  return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) {
+  os << "log_number: " << wal.GetLogNumber();
+  return os;
+}
+
+std::string WalDeletion::DebugString() const {
+  std::ostringstream oss;
+  oss << *this;
+  return oss.str();
+}
+
+Status WalSet::AddWal(const WalAddition& wal) {
+  if (wal.GetLogNumber() < min_wal_number_to_keep_) {
+    // The WAL is already obsolete, ignore it.
+    return Status::OK();
+  }
+
+  auto it = wals_.lower_bound(wal.GetLogNumber());
+  bool existing = it != wals_.end() && it->first == wal.GetLogNumber();
+
+  if (!existing) {
+    wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
+    return Status::OK();
+  }
+
+  assert(existing);
+  if (!wal.GetMetadata().HasSyncedSize()) {
+    std::stringstream ss;
+    ss << "WAL " << wal.GetLogNumber() << " is created more than once";
+    return Status::Corruption("WalSet::AddWal", ss.str());
+  }
+
+  assert(wal.GetMetadata().HasSyncedSize());
+  if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <=
+                                        it->second.GetSyncedSizeInBytes()) {
+    // This is possible because version edits with different synced WAL sizes
+    // for the same WAL can be committed out-of-order. For example, thread
+    // 1 syncs the first 10 bytes of 1.log, while thread 2 syncs the first 20
+    // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after
+    // thread 2.
+    // In this case, just return OK.
+    return Status::OK();
+  }
+
+  // Update the synced size for the given WAL.
+  it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes());
+  return Status::OK();
+}
+
+Status WalSet::AddWals(const WalAdditions& wals) {
+  Status s;
+  for (const WalAddition& wal : wals) {
+    s = AddWal(wal);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  return s;
+}
+
+Status WalSet::DeleteWalsBefore(WalNumber wal) {
+  if (wal > min_wal_number_to_keep_) {
+    min_wal_number_to_keep_ = wal;
+    wals_.erase(wals_.begin(), wals_.lower_bound(wal));
+  }
+  return Status::OK();
+}
+
+void WalSet::Reset() {
+  wals_.clear();
+  min_wal_number_to_keep_ = 0;
+}
+
+Status WalSet::CheckWals(
+    Env* env,
+    const std::unordered_map<WalNumber, std::string>& logs_on_disk) const {
+  assert(env != nullptr);
+
+  Status s;
+  for (const auto& wal : wals_) {
+    const uint64_t log_number = wal.first;
+    const WalMetadata& wal_meta = wal.second;
+
+    if (!wal_meta.HasSyncedSize()) {
+      // The WAL and the WAL directory are not even synced,
+      // so the WAL's inode may not be persisted,
+      // and then the WAL might not show up when listing the WAL directory.
+      continue;
+    }
+
+    if (logs_on_disk.find(log_number) == logs_on_disk.end()) {
+      std::stringstream ss;
+      ss << "Missing WAL with log number: " << log_number << ".";
+      s = Status::Corruption(ss.str());
+      break;
+    }
+
+    uint64_t log_file_size = 0;
+    s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size);
+    if (!s.ok()) {
+      break;
+    }
+    if (log_file_size < wal_meta.GetSyncedSizeInBytes()) {
+      std::stringstream ss;
+      ss << "Size mismatch: WAL (log number: " << log_number
+         << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes()
+         << " bytes, but actually is " << log_file_size << " bytes on disk.";
+      s = Status::Corruption(ss.str());
+      break;
+    }
+  }
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit.h b/src/rocksdb/db/wal_edit.h
new file mode 100644
index 000000000..bb5c5e292
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// WAL related classes used in VersionEdit and VersionSet.
+// Modifications to WalAddition and WalDeletion may need to update
+// VersionEdit and its related tests.
+
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "logging/event_logger.h"
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+using WalNumber = uint64_t;
+
+// Metadata of a WAL.
+class WalMetadata {
+ public:
+  WalMetadata() = default;
+
+  explicit WalMetadata(uint64_t synced_size_bytes)
+      : synced_size_bytes_(synced_size_bytes) {}
+
+  bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; }
+
+  void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; }
+
+  uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; }
+
+ private:
+  friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs);
+  friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs);
+  // The size of the WAL is unknown, used when the WAL is not synced yet or is
+  // empty.
+  constexpr static uint64_t kUnknownWalSize =
+      std::numeric_limits<uint64_t>::max();
+
+  // Size of the most recently synced WAL in bytes.
+  uint64_t synced_size_bytes_ = kUnknownWalSize;
+};
+
+inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) {
+  return lhs.synced_size_bytes_ == rhs.synced_size_bytes_;
+}
+
+inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) {
+  return !(lhs == rhs);
+}
+
+// These tags are persisted to MANIFEST, so they are part of the user API.
+enum class WalAdditionTag : uint32_t {
+  // Indicates that there are no more tags.
+  kTerminate = 1,
+  // Synced size in bytes.
+  kSyncedSize = 2,
+  // Add tags in the future, such as checksum?
+};
+
+// Records the event of adding a WAL in VersionEdit.
+class WalAddition {
+ public:
+  WalAddition() : number_(0), metadata_() {}
+
+  explicit WalAddition(WalNumber number) : number_(number), metadata_() {}
+
+  WalAddition(WalNumber number, WalMetadata meta)
+      : number_(number), metadata_(std::move(meta)) {}
+
+  WalNumber GetLogNumber() const { return number_; }
+
+  const WalMetadata& GetMetadata() const { return metadata_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* src);
+
+  std::string DebugString() const;
+
+ private:
+  WalNumber number_;
+  WalMetadata metadata_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal);
+
+using WalAdditions = std::vector<WalAddition>;
+
+// Records the event of deleting WALs before the specified log number.
+class WalDeletion {
+ public:
+  WalDeletion() : number_(kEmpty) {}
+
+  explicit WalDeletion(WalNumber number) : number_(number) {}
+
+  WalNumber GetLogNumber() const { return number_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* src);
+
+  std::string DebugString() const;
+
+  bool IsEmpty() const { return number_ == kEmpty; }
+
+  void Reset() { number_ = kEmpty; }
+
+ private:
+  static constexpr WalNumber kEmpty = 0;
+
+  WalNumber number_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal);
+
+// Used in VersionSet to keep the current set of WALs.
+//
+// When a WAL is synced or becomes obsolete,
+// a VersionEdit is logged to MANIFEST and
+// the WAL is added to or deleted from WalSet.
+//
+// Not thread safe, needs external synchronization such as holding DB mutex.
+class WalSet {
+ public:
+  // Add WAL(s).
+  // If the WAL is closed,
+  // then there must be an existing unclosed WAL,
+  // otherwise, return Status::Corruption.
+  // Can happen when applying a VersionEdit or recovering from MANIFEST.
+  Status AddWal(const WalAddition& wal);
+  Status AddWals(const WalAdditions& wals);
+
+  // Delete WALs with log number smaller than the specified WAL number.
+  // Can happen when applying a VersionEdit or recovering from MANIFEST.
+  Status DeleteWalsBefore(WalNumber wal);
+
+  // Resets the internal state.
+  void Reset();
+
+  // WALs with number less than MinWalNumberToKeep should not exist in WalSet.
+  WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; }
+
+  const std::map<WalNumber, WalMetadata>& GetWals() const { return wals_; }
+
+  // Checks whether there are missing or corrupted WALs.
+  // Returns Status::OK if there is no missing or corrupted WAL,
+  // otherwise returns Status::Corruption.
+  // logs_on_disk is a map from log number to the log filename.
+  // Note that logs_on_disk may contain logs that are obsolete but
+  // haven't been deleted from disk.
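+  //
+  // A minimal usage sketch (hypothetical caller, for illustration only):
+  //
+  //   std::unordered_map<WalNumber, std::string> logs_on_disk;
+  //   // ... fill from listing the WAL directory: number -> filename ...
+  //   Status s = wal_set.CheckWals(env, logs_on_disk);
+  //   // A non-OK status here should be treated as corruption by recovery.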
+  Status CheckWals(
+      Env* env,
+      const std::unordered_map<WalNumber, std::string>& logs_on_disk) const;
+
+ private:
+  std::map<WalNumber, WalMetadata> wals_;
+  // WAL number < min_wal_number_to_keep_ should not exist in wals_.
+  // It's monotonically increasing, in-memory only, not written to MANIFEST.
+  WalNumber min_wal_number_to_keep_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit_test.cc b/src/rocksdb/db/wal_edit_test.cc
new file mode 100644
index 000000000..0c18fb125
--- /dev/null
+++ b/src/rocksdb/db/wal_edit_test.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WalSet, AddDeleteReset) {
+  WalSet wals;
+  ASSERT_TRUE(wals.GetWals().empty());
+
+  // Create WAL 1 - 10.
+  for (WalNumber log_number = 1; log_number <= 10; log_number++) {
+    wals.AddWal(WalAddition(log_number));
+  }
+  ASSERT_EQ(wals.GetWals().size(), 10);
+
+  // Delete WAL 1 - 5.
+  wals.DeleteWalsBefore(6);
+  ASSERT_EQ(wals.GetWals().size(), 5);
+
+  WalNumber expected_log_number = 6;
+  for (auto it : wals.GetWals()) {
+    WalNumber log_number = it.first;
+    ASSERT_EQ(log_number, expected_log_number++);
+  }
+
+  wals.Reset();
+  ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, Overwrite) {
+  constexpr WalNumber kNumber = 100;
+  constexpr uint64_t kBytes = 200;
+  WalSet wals;
+  wals.AddWal(WalAddition(kNumber));
+  ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize());
+  wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)));
+  ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize());
+  ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes);
+}
+
+TEST(WalSet, SmallerSyncedSize) {
+  constexpr WalNumber kNumber = 100;
+  constexpr uint64_t kBytes = 100;
+  WalSet wals;
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))));
+  const auto wals1 = wals.GetWals();
+  Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0)));
+  const auto wals2 = wals.GetWals();
+  ASSERT_OK(s);
+  ASSERT_EQ(wals1, wals2);
+}
+
+TEST(WalSet, CreateTwice) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+  Status s = wals.AddWal(WalAddition(kNumber));
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") !=
+              std::string::npos);
+}
+
+TEST(WalSet, DeleteAllWals) {
+  constexpr WalNumber kMaxWalNumber = 10;
+  WalSet wals;
+  for (WalNumber i = 1; i <= kMaxWalNumber; i++) {
+    wals.AddWal(WalAddition(i));
+  }
+  ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1));
+}
+
+TEST(WalSet, AddObsoleteWal) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+  ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, MinWalNumberToKeep) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber));
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1));
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1); +} + +class WalSetTest : public DBTestBase { + public: + WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {} + + void SetUp() override { + test_dir_ = test::PerThreadDBPath("wal_set_test"); + ASSERT_OK(env_->CreateDir(test_dir_)); + } + + void TearDown() override { + EXPECT_OK(DestroyDir(env_, test_dir_)); + logs_on_disk_.clear(); + wals_.Reset(); + } + + void CreateWalOnDisk(WalNumber number, const std::string& fname, + uint64_t size_bytes) { + std::unique_ptr f; + std::string fpath = Path(fname); + ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions())); + std::string content(size_bytes, '0'); + ASSERT_OK(f->Append(content)); + ASSERT_OK(f->Close()); + + logs_on_disk_[number] = fpath; + } + + void AddWalToWalSet(WalNumber number, uint64_t size_bytes) { + // Create WAL. + ASSERT_OK(wals_.AddWal(WalAddition(number))); + // Close WAL. + WalMetadata wal(size_bytes); + ASSERT_OK(wals_.AddWal(WalAddition(number, wal))); + } + + Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); } + + private: + std::string test_dir_; + std::unordered_map logs_on_disk_; + WalSet wals_; + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } +}; + +TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); } + +TEST_F(WalSetTest, CheckWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + // log 0 - 5 are obsolete. + if (number > 5) { + AddWalToWalSet(number, size); + } + } + ASSERT_OK(CheckWals()); +} + +TEST_F(WalSetTest, CheckMissingWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + AddWalToWalSet(number, size); + // logs with even number are missing from disk. + if (number % 2) { + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + } + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number is missing. + std::stringstream expected_err; + expected_err << "Missing WAL with log number: " << 2; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +TEST_F(WalSetTest, CheckWalsWithShrinkedSize) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100 + 1; + AddWalToWalSet(number, size); + // logs with even number have shrinked size. + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1); + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number has wrong size. + std::stringstream expected_err; + expected_err << "Size mismatch: WAL (log number: " << 2 << ")"; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc new file mode 100644 index 000000000..a6060235f --- /dev/null +++ b/src/rocksdb/db/wal_manager.cc @@ -0,0 +1,529 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+  auto s = env_->DeleteFile(wal_dir_ + "/" + fname);
+  if (s.ok()) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.erase(number);
+  }
+  return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in the db dir, then get sorted files from the
+  // archived dir, to avoid a race condition where a log file is moved to the
+  // archived dir in between.
+  Status s;
+  // List wal files in the main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to the archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+  files.clear();
+  // List wal files in the archive dir.
+  std::string archivedir = ArchivalDirectory(wal_dir_);
+  Status exists = env_->FileExists(archivedir);
+  if (exists.ok()) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  } else if (!exists.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+                   latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both the db dir and the archived dir. Simply
+      // ignore the one in the db dir. Note that, if we read the
+      // archived dir first, we would have missed the log file.
+      ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+                     log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status WalManager::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options,
+    VersionSet* version_set) {
+  if (seq_per_batch_) {
+    return Status::NotSupported();
+  }
+
+  assert(!seq_per_batch_);
+
+  // Get all sorted WAL files.
+  // Do a binary search over them to find the seq number, instead of opening
+  // every file.
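+  // (The binary search happens inside RetainProbableWalFiles() below: it
+  // compares each file's start sequence number against `seq` and trims the
+  // files that are too old to contain it, so only the remaining files need
+  // to be opened by the returned iterator.)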
+ + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl( + wal_dir_, &db_options_, read_options, file_options_, seq, + std::move(wal_files), version_set, seq_per_batch_, io_tracer_)); + return (*iter)->status(); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void WalManager::PurgeObsoleteWALFiles() { + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) + ? db_options_.WAL_ttl_seconds / 2 + : kDefaultIntervalToDeleteObsoleteWAL; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(wal_dir_); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kWalFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + s = env_->GetFileModificationTime(file_path, &file_m_time); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Can't get file mod time: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Unable to get file size: %s: %s", file_path.c_str(), + s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + 
} + + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Trying to delete more archived log files than " + "exist. Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { + auto archived_log_name = ArchivedLogFileName(wal_dir_, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); + ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n", + fname.c_str(), archived_log_name.c_str(), + s.ToString().c_str()); +} + +Status WalManager::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kWalFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + if (!s.ok() && log_type == kAliveLogFile) { + std::string archived_file = ArchivedLogFileName(path, number); + if (env_->FileExists(archived_file).ok()) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + // oops, the file just got deleted from archived dir! 
move on + s = Status::OK(); + continue; + } + } + } + if (!s.ok()) { + return s; + } + + log_files.push_back(std::unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes))); + } + } + std::sort( + log_files.begin(), log_files.end(), + [](const std::unique_ptr& a, const std::unique_ptr& b) { + LogFileImpl* a_impl = static_cast_with_check(a.get()); + LogFileImpl* b_impl = static_cast_with_check(b.get()); + return *a_impl < *b_impl; + }); + return status; +} + +Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + int64_t start = 0; // signed to avoid overflow when target is < first file. + int64_t end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + int64_t mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = + all_logs.at(static_cast(mid))->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + // end could be -ve. + size_t start_index = + static_cast(std::max(static_cast(0), end)); + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +Status WalManager::ReadFirstRecord(const WalFileType type, + const uint64_t number, + SequenceNumber* sequence) { + *sequence = 0; + if (type != kAliveLogFile && type != kArchivedLogFile) { + ROCKS_LOG_ERROR(db_options_.info_log, "[WalManger] Unknown file type %s", + std::to_string(type).c_str()); + return Status::NotSupported("File Type Not Known " + std::to_string(type)); + } + { + MutexLock l(&read_first_record_cache_mutex_); + auto itr = read_first_record_cache_.find(number); + if (itr != read_first_record_cache_.end()) { + *sequence = itr->second; + return Status::OK(); + } + } + Status s; + if (type == kAliveLogFile) { + std::string fname = LogFileName(wal_dir_, number); + s = ReadFirstLine(fname, number, sequence); + if (!s.ok() && env_->FileExists(fname).ok()) { + // return any error that is not caused by non-existing file + return s; + } + } + + if (type == kArchivedLogFile || !s.ok()) { + // check if the file got moved to archive. + std::string archived_file = ArchivedLogFileName(wal_dir_, number); + s = ReadFirstLine(archived_file, number, sequence); + // maybe the file was deleted from archive dir. If that's the case, return + // Status::OK(). 
The caller will identify this as an empty file because
+    // *sequence == 0.
+    if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+      return Status::OK();
+    }
+  }
+
+  if (s.ok() && *sequence != 0) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+                                  std::unique_ptr<LogFile>* log_file) {
+  if (!log_file) {
+    return Status::InvalidArgument("log_file not preallocated.");
+  }
+
+  if (!number) {
+    return Status::PathNotFound("log file not available");
+  }
+
+  Status s;
+
+  uint64_t size_bytes;
+  s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  log_file->reset(new LogFileImpl(number, kAliveLogFile,
+                                  0,  // SequenceNumber
+                                  size_bytes));
+
+  return Status::OK();
+}
+
+// The function returns status.ok() and sequence == 0 if the file exists, but
+// is empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+                                 const uint64_t number,
+                                 SequenceNumber* sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+                     (this->ignore_error ? "(ignoring error) " : ""), fname,
+                     static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  std::unique_ptr<FSSequentialFile> file;
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+  std::unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file), fname, io_tracer_));
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = db_options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !db_options_.paranoid_checks;
+  log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+                     true /*checksum*/, number);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !db_options_.paranoid_checks)) {
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO: read records until the first non-corrupt entry?
+    } else {
+      WriteBatch batch;
+      // We can overwrite an existing non-OK Status since it'd only reach here
+      // with `paranoid_checks == false`.
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (status.ok()) {
+        *sequence = WriteBatchInternal::Sequence(&batch);
+        return status;
+      }
+    }
+  }
+
+  if (status.ok() && reader.IsCompressedAndEmptyFile()) {
+    // In case of wal_compression, it writes a `kSetCompressionType` record
+    // which is not associated with any sequence number. As a result, for an
+    // empty file, GetSortedWalsOfType() would skip these WALs, causing the
+    // operations to fail.
+    // Therefore, in order to avoid that failure, it sets sequence_number to
+    // 1, indicating those WALs should be included.
+    *sequence = 1;
+  } else {
+    // ReadRecord might have returned false on EOF, which means that the log
+    // file is empty. Or, a failure may have occurred while processing the
+    // first entry. In any case, return status and set the sequence number to
+    // 0.
+    *sequence = 0;
+  }
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..8cc067935
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using Reader or Writer
+// abstractions.
+class WalManager {
+ public:
+  WalManager(const ImmutableDBOptions& db_options,
+             const FileOptions& file_options,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const bool seq_per_batch = false)
+      : db_options_(db_options),
+        file_options_(file_options),
+        env_(db_options.env),
+        fs_(db_options.fs, io_tracer),
+        purge_wal_files_last_run_(0),
+        seq_per_batch_(seq_per_batch),
+        wal_dir_(db_options_.GetWalDir()),
+        wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()),
+        io_tracer_(io_tracer) {}
+
+  Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow the user to tail the transaction log to find all recent changes to
+  // the database that are newer than `seq_number`.
+  Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options,
+      VersionSet* version_set);
+
+  void PurgeObsoleteWALFiles();
+
+  void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+  Status DeleteFile(const std::string& fname, uint64_t number);
+
+  Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence) {
+    return ReadFirstRecord(type, number, sequence);
+  }
+
+  Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+                            SequenceNumber* sequence) {
+    return ReadFirstLine(fname, number, sequence);
+  }
+
+ private:
+  Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+                             WalFileType type);
+  // Requires: all_logs should be sorted with the earliest log file first.
+  // Retains all log files in all_logs which contain updates with seq no.
+  // greater than or equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  // ReadFirstRecord checks the read_first_record_cache_ to see if the entry
+  // exists or not. If not, it will read the WAL file.
+  // In case of wal_compression, the WAL contains a `kSetCompressionType`
+  // record which is not associated with any sequence number.
So the sequence_number is
+  // set to 1 if that WAL doesn't include any other record (i.e. it is
+  // basically empty), in order to include that WAL, and this value is
+  // inserted into read_first_record_cache_.
+  // Therefore, sequence_number is used as a boolean indicating whether the
+  // WAL should be included, and that sequence_number shouldn't be used for
+  // any other purpose.
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  // In case of no wal_compression, ReadFirstLine returns status.ok() and
+  // sequence == 0 if the file exists, but is empty.
+  // In case of wal_compression, the WAL contains a
+  // `kSetCompressionType` record which is not associated with any sequence
+  // number if that WAL doesn't include any other record (basically empty). As
+  // a result, for an empty file, GetSortedWalsOfType() would skip these WALs,
+  // causing the operations to fail. To avoid that, it sets sequence_number to
+  // 1 in order to include that WAL.
+  Status ReadFirstLine(const std::string& fname, const uint64_t number,
+                       SequenceNumber* sequence);
+
+  // ------- state from DBImpl ------
+  const ImmutableDBOptions& db_options_;
+  const FileOptions file_options_;
+  Env* env_;
+  const FileSystemPtr fs_;
+
+  // ------- WalManager state -------
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  bool seq_per_batch_;
+
+  const std::string& wal_dir_;
+
+  bool wal_in_db_path_;
+
+  // Obsolete files will be deleted every this many seconds if ttl deletion is
+  // enabled and the archive size_limit is disabled.
+  static constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+};
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..4ad4e9749
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,346 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#ifndef ROCKSDB_LITE + +#include "db/wal_manager.h" + +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/log_writer.h" +#include "db/version_set.h" +#include "env/mock_env.h" +#include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO(icanadi) mock out VersionSet +// TODO(icanadi) move other WalManager-specific tests from db_test here +class WalManagerTest : public testing::Test { + public: + WalManagerTest() + : dbname_(test::PerThreadDBPath("wal_manager_test")), + db_options_(), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + current_log_number_(0) { + env_.reset(MockEnv::Create(Env::Default())); + EXPECT_OK(DestroyDB(dbname_, Options())); + } + + void Init() { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_))); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.wal_dir = dbname_; + db_options_.env = env_.get(); + db_options_.fs = env_->GetFileSystem(); + db_options_.clock = env_->GetSystemClock().get(); + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); + } + + void Reopen() { + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); + } + + // NOT thread safe + void Put(const std::string& key, const std::string& value) { + assert(current_log_writer_.get() != nullptr); + uint64_t seq = versions_->LastSequence() + 1; + WriteBatch batch; + ASSERT_OK(batch.Put(key, value)); + WriteBatchInternal::SetSequence(&batch, seq); + ASSERT_OK( + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); + versions_->SetLastAllocatedSequence(seq); + versions_->SetLastPublishedSequence(seq); + versions_->SetLastSequence(seq); + } + + // NOT thread safe + void RollTheLog(bool /*archived*/) { + current_log_number_++; + std::string fname = ArchivedLogFileName(dbname_, current_log_number_); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer, + nullptr)); + current_log_writer_.reset( + new log::Writer(std::move(file_writer), 0, false)); + } + + void CreateArchiveLogs(int num_logs, int entries_per_log) { + for (int i = 1; i <= num_logs; ++i) { + RollTheLog(true); + for (int k = 0; k < entries_per_log; ++k) { + Put(std::to_string(k), std::string(1024, 'a')); + } + } + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + std::unique_ptr iter; + Status status = wal_manager_->GetUpdatesSince( + seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); + EXPECT_OK(status); + return iter; + } + + std::unique_ptr env_; + std::string dbname_; + ImmutableDBOptions db_options_; + WriteController write_controller_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + WriteBufferManager write_buffer_manager_; + std::unique_ptr versions_; + std::unique_ptr wal_manager_; + 
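+  // Writer for the WAL file currently being appended to by Put() and rolled
+  // by RollTheLog() below.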
+ std::unique_ptr current_log_writer_; + uint64_t current_log_number_; +}; + +TEST_F(WalManagerTest, ReadFirstRecordCache) { + Init(); + std::string path = dbname_ + "/000001.log"; + std::unique_ptr file; + ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, + nullptr)); + + SequenceNumber s; + ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); + ASSERT_EQ(s, 0U); + + ASSERT_OK( + wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); + ASSERT_EQ(s, 0U); + + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, FileOptions())); + log::Writer writer(std::move(file_writer), 1, + db_options_.recycle_log_file_num > 0); + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "bar")); + WriteBatchInternal::SetSequence(&batch, 10); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); + + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. + // Waiting for lei to finish with db_test + // env_->count_sequential_reads_ = true; + // sequential_read_counter_ sanity test + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // did a read + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // no new reads since the value is cached + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); +} + +namespace { +uint64_t GetLogDirSize(std::string dir_path, Env* env) { + uint64_t dir_size = 0; + std::vector files; + EXPECT_OK(env->GetChildren(dir_path, &files)); + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kWalFile) { + std::string const file_path = dir_path + "/" + f; + uint64_t file_size; + EXPECT_OK(env->GetFileSize(file_path, &file_size)); + dir_size += file_size; + } + } + return dir_size; +} +std::vector ListSpecificFiles( + Env* env, const std::string& path, const FileType expected_file_type) { + std::vector files; + std::vector file_numbers; + uint64_t number; + FileType type; + EXPECT_OK(env->GetChildren(path, &files)); + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == expected_file_type) { + file_numbers.push_back(number); + } + } + } + return file_numbers; +} + +int CountRecords(TransactionLogIterator* iter) { + int count = 0; + SequenceNumber lastSequence = 0; + BatchResult res; + while (iter->Valid()) { + res = iter->GetBatch(); + EXPECT_TRUE(res.sequence > lastSequence); + ++count; + lastSequence = res.sequence; + EXPECT_OK(iter->status()); + iter->Next(); + } + EXPECT_OK(iter->status()); + return count; +} +} // anonymous namespace + +TEST_F(WalManagerTest, WALArchivalSizeLimit) { + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; + Init(); + + // TEST : Create WalManager with huge size limit and no ttl. + // Create some archived files and call PurgeObsoleteWALFiles(). + // Count the archived log files that survived. + // Assert that all of them did. + // Change size limit. Re-open WalManager. + // Assert that archive is not greater than WAL_size_limit_MB after + // PurgeObsoleteWALFiles() + // Set ttl and time_to_check_ to small values. Re-open db. 
+ // Assert that there are no archived logs left. + + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_.get(), archive_dir, kWalFile); + ASSERT_EQ(log_files.size(), 20U); + + db_options_.WAL_size_limit_MB = 8; + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + uint64_t archive_size = GetLogDirSize(archive_dir, env_.get()); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST_F(WalManagerTest, WALArchivalTtl) { + db_options_.WAL_ttl_seconds = 1000; + Init(); + + // TEST : Create WalManager with a ttl and no size limit. + // Create some archived log files and call PurgeObsoleteWALFiles(). + // Assert that files are not deleted + // Reopen db with small ttl. + // Assert that all archived logs was removed. + + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_.get(), archive_dir, kWalFile); + ASSERT_GT(log_files.size(), 0U); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(3 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) { + Init(); + RollTheLog(false); + Put("key1", std::string(1024, 'a')); + // Create a zero record WAL file. + RollTheLog(false); + RollTheLog(false); + + Put("key2", std::string(1024, 'a')); + + auto iter = OpenTransactionLogIter(0); + ASSERT_EQ(2, CountRecords(iter.get())); +} + +TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) { + Init(); + RollTheLog(false); + auto iter = OpenTransactionLogIter(0); + // Check that an empty iterator is returned + ASSERT_TRUE(!iter->Valid()); +} + +TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) { + Init(); + CreateArchiveLogs(2, 100); + auto iter = OpenTransactionLogIter(0); + CreateArchiveLogs(1, 100); + int i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 200); + // A new log file was added after the iterator was created. + // TryAgain indicates a new iterator is needed to fetch the new data + ASSERT_TRUE(iter->status().IsTryAgain()); + + iter = OpenTransactionLogIter(0); + i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 300); + ASSERT_TRUE(iter->status().ok()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/wide/db_wide_basic_test.cc b/src/rocksdb/db/wide/db_wide_basic_test.cc new file mode 100644 index 000000000..1ffe314fe --- /dev/null +++ b/src/rocksdb/db/wide/db_wide_basic_test.cc @@ -0,0 +1,654 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +class DBWideBasicTest : public DBTestBase { + protected: + explicit DBWideBasicTest() + : DBTestBase("db_wide_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBWideBasicTest, PutEntity) { + Options options = GetDefaultOptions(); + + // Write a couple of wide-column entities and a plain old key-value, then read + // them back. + constexpr char first_key[] = "first"; + constexpr char first_value_of_default_column[] = "hello"; + WideColumns first_columns{ + {kDefaultWideColumnName, first_value_of_default_column}, + {"attr_name1", "foo"}, + {"attr_name2", "bar"}}; + + constexpr char second_key[] = "second"; + WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}}; + + constexpr char third_key[] = "third"; + constexpr char third_value[] = "baz"; + + auto verify = [&]() { + const WideColumns expected_third_columns{ + {kDefaultWideColumnName, third_value}}; + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key, + &result)); + ASSERT_EQ(result, first_value_of_default_column); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), first_columns); + } + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key, + &result)); + ASSERT_TRUE(result.empty()); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), second_columns); + } + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key, + &result)); + ASSERT_EQ(result, third_value); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + third_key, &result)); + + ASSERT_EQ(result.columns(), expected_third_columns); + } + + { + constexpr size_t num_keys = 3; + + std::array keys{{first_key, second_key, third_key}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value_of_default_column); + + ASSERT_OK(statuses[1]); + ASSERT_TRUE(values[1].empty()); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value_of_default_column); + ASSERT_EQ(iter->columns(), first_columns); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_TRUE(iter->value().empty()); + ASSERT_EQ(iter->columns(), second_columns); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_EQ(iter->columns(), expected_third_columns); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + + 
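+      // Also verify the same contents via reverse iteration.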
iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_EQ(iter->columns(), expected_third_columns); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_TRUE(iter->value().empty()); + ASSERT_EQ(iter->columns(), second_columns); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value_of_default_column); + ASSERT_EQ(iter->columns(), first_columns); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + }; + + // Use the DB::PutEntity API to write the first entity + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + first_key, first_columns)); + + // Use WriteBatch to write the second entity + WriteBatch batch; + ASSERT_OK( + batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // Use Put to write the plain key-value + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), third_key, + third_value)); + + // Try reading from memtable + verify(); + + // Try reading after recovery + Close(); + options.avoid_flush_during_recovery = true; + Reopen(options); + + verify(); + + // Try reading from storage + ASSERT_OK(Flush()); + + verify(); +} + +TEST_F(DBWideBasicTest, PutEntityColumnFamily) { + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"corinthian"}, options); + + // Use the DB::PutEntity API + constexpr char first_key[] = "first"; + WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}}; + + ASSERT_OK( + db_->PutEntity(WriteOptions(), handles_[1], first_key, first_columns)); + + // Use WriteBatch + constexpr char second_key[] = "second"; + WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}}; + + WriteBatch batch; + ASSERT_OK(batch.PutEntity(handles_[1], second_key, second_columns)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); +} + +TEST_F(DBWideBasicTest, MergePlainKeyValue) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Put + Merge + constexpr char first_key[] = "first"; + constexpr char first_base_value[] = "hello"; + constexpr char first_merge_op[] = "world"; + + // Delete + Merge + constexpr char second_key[] = "second"; + constexpr char second_merge_op[] = "foo"; + + // Merge without any preceding KV + constexpr char third_key[] = "third"; + constexpr char third_merge_op[] = "bar"; + + auto write_base = [&]() { + // Write "base" KVs: a Put for the 1st key and a Delete for the 2nd one; + // note there is no "base" KV for the 3rd + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), first_key, + first_base_value)); + ASSERT_OK( + db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), second_key)); + }; + + auto write_merge = [&]() { + // Write Merge operands + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, + first_merge_op)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key, + second_merge_op)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key, + third_merge_op)); + }; + + const std::string expected_first_column(std::string(first_base_value) + "," + + first_merge_op); + const WideColumns 
expected_first_columns{ + {kDefaultWideColumnName, expected_first_column}}; + const WideColumns expected_second_columns{ + {kDefaultWideColumnName, second_merge_op}}; + const WideColumns expected_third_columns{ + {kDefaultWideColumnName, third_merge_op}}; + + auto verify = [&]() { + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), expected_first_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), expected_second_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + third_key, &result)); + + ASSERT_EQ(result.columns(), expected_third_columns); + } + + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), expected_first_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_first_columns); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), expected_second_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_second_columns); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), expected_third_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_third_columns); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), expected_third_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_third_columns); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), expected_second_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_second_columns); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), expected_first_columns[0].value()); + ASSERT_EQ(iter->columns(), expected_first_columns); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + }; + + { + // Base KVs (if any) and Merge operands both in memtable (note: we take a + // snapshot in between to make sure they do not get reconciled during the + // subsequent flush) + write_base(); + ManagedSnapshot snapshot(db_); + write_merge(); + verify(); + + // Base KVs (if any) and Merge operands both in storage + ASSERT_OK(Flush()); + verify(); + } + + // Base KVs (if any) in storage, Merge operands in memtable + DestroyAndReopen(options); + write_base(); + ASSERT_OK(Flush()); + write_merge(); + verify(); +} + +TEST_F(DBWideBasicTest, MergeEntity) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + + const std::string delim("|"); + options.merge_operator = MergeOperators::CreateStringAppendOperator(delim); + + Reopen(options); + + // Test Merge with two entities: one that has the default column and one that + // doesn't + constexpr char first_key[] = "first"; + WideColumns first_columns{{kDefaultWideColumnName, "a"}, + {"attr_name1", "foo"}, + {"attr_name2", "bar"}}; + constexpr char first_merge_operand[] = "bla1"; + + 
constexpr char second_key[] = "second"; + WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}}; + constexpr char second_merge_operand[] = "bla2"; + + auto write_base = [&]() { + // Use the DB::PutEntity API + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + first_key, first_columns)); + + // Use WriteBatch + WriteBatch batch; + ASSERT_OK(batch.PutEntity(db_->DefaultColumnFamily(), second_key, + second_columns)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + }; + + auto write_merge = [&]() { + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, + first_merge_operand)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key, + second_merge_operand)); + }; + + const std::string first_expected_default(first_columns[0].value().ToString() + + delim + first_merge_operand); + const std::string second_expected_default(delim + second_merge_operand); + + auto verify_basic = [&]() { + WideColumns first_expected_columns{ + {kDefaultWideColumnName, first_expected_default}, + first_columns[1], + first_columns[2]}; + + WideColumns second_expected_columns{ + {kDefaultWideColumnName, second_expected_default}, + second_columns[0], + second_columns[1]}; + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key, + &result)); + ASSERT_EQ(result, first_expected_default); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), first_expected_columns); + } + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key, + &result)); + ASSERT_EQ(result, second_expected_default); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), second_expected_columns); + } + + { + constexpr size_t num_keys = 2; + + std::array keys{{first_key, second_key}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &values[0], &statuses[0]); + + ASSERT_EQ(values[0], first_expected_default); + ASSERT_OK(statuses[0]); + + ASSERT_EQ(values[1], second_expected_default); + ASSERT_OK(statuses[1]); + } + + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_expected_default); + ASSERT_EQ(iter->columns(), first_expected_columns); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_expected_default); + ASSERT_EQ(iter->columns(), second_expected_columns); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_expected_default); + ASSERT_EQ(iter->columns(), second_expected_columns); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_expected_default); + ASSERT_EQ(iter->columns(), first_expected_columns); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + }; + + auto verify_merge_ops_pre_compaction = [&]() { + constexpr size_t 
+ + auto verify_merge_ops_pre_compaction = [&]() { + constexpr size_t num_merge_operands = 2; + + GetMergeOperandsOptions get_merge_opts; + get_merge_opts.expected_max_number_of_operands = num_merge_operands; + + { + std::array<PinnableSlice, num_merge_operands> merge_operands; + int number_of_operands = 0; + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &merge_operands[0], + &get_merge_opts, &number_of_operands)); + + ASSERT_EQ(number_of_operands, num_merge_operands); + ASSERT_EQ(merge_operands[0], first_columns[0].value()); + ASSERT_EQ(merge_operands[1], first_merge_operand); + } + + { + std::array<PinnableSlice, num_merge_operands> merge_operands; + int number_of_operands = 0; + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &merge_operands[0], + &get_merge_opts, &number_of_operands)); + + ASSERT_EQ(number_of_operands, num_merge_operands); + ASSERT_TRUE(merge_operands[0].empty()); + ASSERT_EQ(merge_operands[1], second_merge_operand); + } + }; + + auto verify_merge_ops_post_compaction = [&]() { + constexpr size_t num_merge_operands = 1; + + GetMergeOperandsOptions get_merge_opts; + get_merge_opts.expected_max_number_of_operands = num_merge_operands; + + { + std::array<PinnableSlice, num_merge_operands> merge_operands; + int number_of_operands = 0; + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &merge_operands[0], + &get_merge_opts, &number_of_operands)); + + ASSERT_EQ(number_of_operands, num_merge_operands); + ASSERT_EQ(merge_operands[0], first_expected_default); + } + + { + std::array<PinnableSlice, num_merge_operands> merge_operands; + int number_of_operands = 0; + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &merge_operands[0], + &get_merge_opts, &number_of_operands)); + + ASSERT_EQ(number_of_operands, num_merge_operands); + ASSERT_EQ(merge_operands[0], second_expected_default); + } + }; + + { + // Base KVs and Merge operands both in memtable (note: we take a snapshot in + // between to make sure they do not get reconciled during the subsequent + // flush) + write_base(); + ManagedSnapshot snapshot(db_); + write_merge(); + verify_basic(); + verify_merge_ops_pre_compaction(); + + // Base KVs and Merge operands both in storage + ASSERT_OK(Flush()); + verify_basic(); + verify_merge_ops_pre_compaction(); + } + + // Base KVs in storage, Merge operands in memtable + DestroyAndReopen(options); + write_base(); + ASSERT_OK(Flush()); + write_merge(); + verify_basic(); + verify_merge_ops_pre_compaction(); + + // Flush and compact + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + verify_basic(); + verify_merge_ops_post_compaction(); +} + +TEST_F(DBWideBasicTest, PutEntityTimestampError) { + // Note: timestamps are currently not supported + + Options options = GetDefaultOptions(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + + ColumnFamilyHandle* handle = nullptr; + ASSERT_OK(db_->CreateColumnFamily(options, "corinthian", &handle)); + std::unique_ptr<ColumnFamilyHandle> handle_guard(handle); + + // Use the DB::PutEntity API + constexpr char first_key[] = "first"; + WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}}; + + ASSERT_TRUE(db_->PutEntity(WriteOptions(), handle, first_key, first_columns) + .IsInvalidArgument()); + + // Use WriteBatch + constexpr char second_key[] = "second"; + WideColumns second_columns{{"doric", "column"}, {"ionic", "column"}}; + + WriteBatch batch; + ASSERT_TRUE( + batch.PutEntity(handle, second_key, second_columns).IsInvalidArgument()); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); +} + 
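One detail worth noting in the WriteBatch variant above: the InvalidArgument comes from WriteBatch::PutEntity itself, before anything is appended, so the batch stays empty and the subsequent DB::Write succeeds. A hedged sketch of the same behavior, assuming a handle for a timestamp-enabled column family:

    WriteBatch batch;
    const Status s = batch.PutEntity(handle, "key", WideColumns{{"a", "b"}});
    assert(s.IsInvalidArgument());
    assert(batch.Count() == 0);  // the failed PutEntity appended nothing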
+TEST_F(DBWideBasicTest, PutEntitySerializationError) { + // Make sure duplicate columns are caught + + Options options = GetDefaultOptions(); + + // Use the DB::PutEntity API + constexpr char first_key[] = "first"; + WideColumns first_columns{{"foo", "bar"}, {"foo", "baz"}}; + + ASSERT_TRUE(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + first_key, first_columns) + .IsCorruption()); + + // Use WriteBatch + constexpr char second_key[] = "second"; + WideColumns second_columns{{"column", "doric"}, {"column", "ionic"}}; + + WriteBatch batch; + ASSERT_TRUE( + batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns) + .IsCorruption()); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/wide/wide_column_serialization.cc b/src/rocksdb/db/wide/wide_column_serialization.cc new file mode 100644 index 000000000..f62143c40 --- /dev/null +++ b/src/rocksdb/db/wide/wide_column_serialization.cc @@ -0,0 +1,182 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wide/wide_column_serialization.h" + +#include <algorithm> +#include <cassert> +#include <limits> + +#include "rocksdb/slice.h" +#include "util/autovector.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default, + const WideColumns& columns, + std::string& output) { + const size_t num_columns = + value_of_default ? 
columns.size() + 1 : columns.size(); + + if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) { + return Status::InvalidArgument("Too many wide columns"); + } + + PutVarint32(&output, kCurrentVersion); + + PutVarint32(&output, static_cast<uint32_t>(num_columns)); + + const Slice* prev_name = nullptr; + if (value_of_default) { + if (value_of_default->size() > + static_cast<size_t>(std::numeric_limits<uint32_t>::max())) { + return Status::InvalidArgument("Wide column value too long"); + } + + PutLengthPrefixedSlice(&output, kDefaultWideColumnName); + PutVarint32(&output, static_cast<uint32_t>(value_of_default->size())); + + prev_name = &kDefaultWideColumnName; + } + + for (size_t i = 0; i < columns.size(); ++i) { + const WideColumn& column = columns[i]; + + const Slice& name = column.name(); + if (name.size() > + static_cast<size_t>(std::numeric_limits<uint32_t>::max())) { + return Status::InvalidArgument("Wide column name too long"); + } + + if (prev_name && prev_name->compare(name) >= 0) { + return Status::Corruption("Wide columns out of order"); + } + + const Slice& value = column.value(); + if (value.size() > + static_cast<size_t>(std::numeric_limits<uint32_t>::max())) { + return Status::InvalidArgument("Wide column value too long"); + } + + PutLengthPrefixedSlice(&output, name); + PutVarint32(&output, static_cast<uint32_t>(value.size())); + + prev_name = &name; + } + + if (value_of_default) { + output.append(value_of_default->data(), value_of_default->size()); + } + + for (const auto& column : columns) { + const Slice& value = column.value(); + + output.append(value.data(), value.size()); + } + + return Status::OK(); +} + +Status WideColumnSerialization::Deserialize(Slice& input, + WideColumns& columns) { + assert(columns.empty()); + + uint32_t version = 0; + if (!GetVarint32(&input, &version)) { + return Status::Corruption("Error decoding wide column version"); + } + + if (version > kCurrentVersion) { + return Status::NotSupported("Unsupported wide column version"); + } + + uint32_t num_columns = 0; + if (!GetVarint32(&input, &num_columns)) { + return Status::Corruption("Error decoding number of wide columns"); + } + + if (!num_columns) { + return Status::OK(); + } + + columns.reserve(num_columns); + + autovector<uint32_t, 16> column_value_sizes; + column_value_sizes.reserve(num_columns); + + for (uint32_t i = 0; i < num_columns; ++i) { + Slice name; + if (!GetLengthPrefixedSlice(&input, &name)) { + return Status::Corruption("Error decoding wide column name"); + } + + if (!columns.empty() && columns.back().name().compare(name) >= 0) { + return Status::Corruption("Wide columns out of order"); + } + + columns.emplace_back(name, Slice()); + + uint32_t value_size = 0; + if (!GetVarint32(&input, &value_size)) { + return Status::Corruption("Error decoding wide column value size"); + } + + column_value_sizes.emplace_back(value_size); + } + + const Slice data(input); + size_t pos = 0; + + for (uint32_t i = 0; i < num_columns; ++i) { + const uint32_t value_size = column_value_sizes[i]; + + if (pos + value_size > data.size()) { + return Status::Corruption("Error decoding wide column value payload"); + } + + columns[i].value() = Slice(data.data() + pos, value_size); + + pos += value_size; + } + + return Status::OK(); +}
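To make the format concrete, here is a worked sketch that hand-assembles a single-column entity {"foo" -> "bar"} with the same coding utilities the unit tests below rely on, then round-trips it through Deserialize (illustration only; real writers should go through Serialize):

    std::string buf;
    PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);  // version
    PutVarint32(&buf, 1);                 // number of columns
    PutLengthPrefixedSlice(&buf, "foo");  // cns 1 + cn 1
    PutVarint32(&buf, 3);                 // cvs 1
    buf.append("bar");                    // cv 1 (value payload section)

    Slice input(buf);
    WideColumns columns;
    assert(WideColumnSerialization::Deserialize(input, columns).ok());
    const WideColumns expected{{"foo", "bar"}};
    assert(columns == expected);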
+ +WideColumns::const_iterator WideColumnSerialization::Find( + const WideColumns& columns, const Slice& column_name) { + const auto it = + std::lower_bound(columns.cbegin(), columns.cend(), column_name, + [](const WideColumn& lhs, const Slice& rhs) { + return lhs.name().compare(rhs) < 0; + }); + + if (it == columns.cend() || it->name() != column_name) { + return columns.cend(); + } + + return it; +} + +Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input, + Slice& value) { + WideColumns columns; + + const Status s = Deserialize(input, columns); + if (!s.ok()) { + return s; + } + + if (columns.empty() || columns[0].name() != kDefaultWideColumnName) { + value.clear(); + return Status::OK(); + } + + value = columns[0].value(); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/wide/wide_column_serialization.h b/src/rocksdb/db/wide/wide_column_serialization.h new file mode 100644 index 000000000..f0ffbd392 --- /dev/null +++ b/src/rocksdb/db/wide/wide_column_serialization.h @@ -0,0 +1,77 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> +#include <string> + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/wide_columns.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// Wide-column serialization/deserialization primitives. +// +// The two main parts of the layout are 1) a sorted index containing the column +// names and column value sizes and 2) the column values themselves. Keeping the +// index and the values separate will enable selectively reading column values +// down the line. Note that currently the index has to be fully parsed in order +// to find out the offset of each column value. +// +// Legend: cn = column name, cv = column value, cns = column name size, cvs = +// column value size. +// +// +----------+--------------+----------+-------+----------+---... +// | version | # of columns | cns 1 | cn 1 | cvs 1 | +// +----------+--------------+----------+-------+----------+---... +// | varint32 | varint32 | varint32 | bytes | varint32 | +// +----------+--------------+----------+-------+----------+---... +// +// ... continued ... 
+// +// ...---+----------+-------+----------+-------+---...---+-------+ +// | cns N | cn N | cvs N | cv 1 | | cv N | +// ...---+----------+-------+----------+-------+---...---+-------+ +// | varint32 | bytes | varint32 | bytes | | bytes | +// ...---+----------+-------+----------+-------+---...---+-------+ + +class WideColumnSerialization { + public: + static Status Serialize(const WideColumns& columns, std::string& output); + static Status Serialize(const Slice& value_of_default, + const WideColumns& other_columns, + std::string& output); + + static Status Deserialize(Slice& input, WideColumns& columns); + + static WideColumns::const_iterator Find(const WideColumns& columns, + const Slice& column_name); + static Status GetValueOfDefaultColumn(Slice& input, Slice& value); + + static constexpr uint32_t kCurrentVersion = 1; + + private: + static Status SerializeImpl(const Slice* value_of_default, + const WideColumns& columns, std::string& output); +}; + +inline Status WideColumnSerialization::Serialize(const WideColumns& columns, + std::string& output) { + constexpr Slice* value_of_default = nullptr; + + return SerializeImpl(value_of_default, columns, output); +} + +inline Status WideColumnSerialization::Serialize( + const Slice& value_of_default, const WideColumns& other_columns, + std::string& output) { + return SerializeImpl(&value_of_default, other_columns, output); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/wide/wide_column_serialization_test.cc b/src/rocksdb/db/wide/wide_column_serialization_test.cc new file mode 100644 index 000000000..8060d2f24 --- /dev/null +++ b/src/rocksdb/db/wide/wide_column_serialization_test.cc @@ -0,0 +1,338 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/wide/wide_column_serialization.h" + +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(WideColumnSerializationTest, Construct) { + constexpr char foo[] = "foo"; + constexpr char bar[] = "bar"; + + const std::string foo_str(foo); + const std::string bar_str(bar); + + const Slice foo_slice(foo_str); + const Slice bar_slice(bar_str); + + { + WideColumn column(foo, bar); + ASSERT_EQ(column.name(), foo); + ASSERT_EQ(column.value(), bar); + } + + { + WideColumn column(foo_str, bar); + ASSERT_EQ(column.name(), foo_str); + ASSERT_EQ(column.value(), bar); + } + + { + WideColumn column(foo_slice, bar); + ASSERT_EQ(column.name(), foo_slice); + ASSERT_EQ(column.value(), bar); + } + + { + WideColumn column(foo, bar_str); + ASSERT_EQ(column.name(), foo); + ASSERT_EQ(column.value(), bar_str); + } + + { + WideColumn column(foo_str, bar_str); + ASSERT_EQ(column.name(), foo_str); + ASSERT_EQ(column.value(), bar_str); + } + + { + WideColumn column(foo_slice, bar_str); + ASSERT_EQ(column.name(), foo_slice); + ASSERT_EQ(column.value(), bar_str); + } + + { + WideColumn column(foo, bar_slice); + ASSERT_EQ(column.name(), foo); + ASSERT_EQ(column.value(), bar_slice); + } + + { + WideColumn column(foo_str, bar_slice); + ASSERT_EQ(column.name(), foo_str); + ASSERT_EQ(column.value(), bar_slice); + } + + { + WideColumn column(foo_slice, bar_slice); + ASSERT_EQ(column.name(), foo_slice); + ASSERT_EQ(column.value(), bar_slice); + } + + { + constexpr char foo_name[] = "foo_name"; + constexpr char bar_value[] = "bar_value"; + + WideColumn column(std::piecewise_construct, + std::forward_as_tuple(foo_name, sizeof(foo) - 1), + std::forward_as_tuple(bar_value, sizeof(bar) - 1)); + ASSERT_EQ(column.name(), foo); + ASSERT_EQ(column.value(), bar); + } +} + +TEST(WideColumnSerializationTest, SerializeDeserialize) { + WideColumns columns{{"foo", "bar"}, {"hello", "world"}}; + std::string output; + + ASSERT_OK(WideColumnSerialization::Serialize(columns, output)); + + Slice input(output); + WideColumns deserialized_columns; + + ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns)); + ASSERT_EQ(columns, deserialized_columns); + + { + const auto it = WideColumnSerialization::Find(deserialized_columns, "foo"); + ASSERT_NE(it, deserialized_columns.cend()); + ASSERT_EQ(*it, deserialized_columns.front()); + } + + { + const auto it = + WideColumnSerialization::Find(deserialized_columns, "hello"); + ASSERT_NE(it, deserialized_columns.cend()); + ASSERT_EQ(*it, deserialized_columns.back()); + } + + { + const auto it = + WideColumnSerialization::Find(deserialized_columns, "fubar"); + ASSERT_EQ(it, deserialized_columns.cend()); + } + + { + const auto it = + WideColumnSerialization::Find(deserialized_columns, "snafu"); + ASSERT_EQ(it, deserialized_columns.cend()); + } +} + +TEST(WideColumnSerializationTest, SerializeWithPrepend) { + Slice value_of_default("baz"); + WideColumns other_columns{{"foo", "bar"}, {"hello", "world"}}; + + std::string output; + ASSERT_OK(WideColumnSerialization::Serialize(value_of_default, other_columns, + output)); + + Slice input(output); + + WideColumns deserialized_columns; + ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns)); + + WideColumns expected_columns{{kDefaultWideColumnName, value_of_default}, + other_columns[0], + other_columns[1]}; + ASSERT_EQ(deserialized_columns, expected_columns); +} + +TEST(WideColumnSerializationTest, SerializeDuplicateError) { + WideColumns columns{{"foo", "bar"}, 
{"foo", "baz"}}; + std::string output; + + ASSERT_TRUE( + WideColumnSerialization::Serialize(columns, output).IsCorruption()); +} + +TEST(WideColumnSerializationTest, SerializeWithPrependDuplicateError) { + Slice value_of_default("baz"); + WideColumns other_columns{{kDefaultWideColumnName, "dup"}, {"foo", "bar"}}; + + std::string output; + ASSERT_TRUE(WideColumnSerialization::Serialize(value_of_default, + other_columns, output) + .IsCorruption()); +} + +TEST(WideColumnSerializationTest, SerializeOutOfOrderError) { + WideColumns columns{{"hello", "world"}, {"foo", "bar"}}; + std::string output; + + ASSERT_TRUE( + WideColumnSerialization::Serialize(columns, output).IsCorruption()); +} + +TEST(WideColumnSerializationTest, DeserializeVersionError) { + // Can't decode version + + std::string buf; + + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "version")); +} + +TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) { + // Unsupported version + constexpr uint32_t future_version = 1000; + + std::string buf; + PutVarint32(&buf, future_version); + + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_TRUE(std::strstr(s.getState(), "version")); +} + +TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) { + // Can't decode number of columns + + std::string buf; + PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "number")); +} + +TEST(WideColumnSerializationTest, DeserializeColumnsError) { + std::string buf; + + PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + + constexpr uint32_t num_columns = 2; + PutVarint32(&buf, num_columns); + + // Can't decode the first column name + { + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "name")); + } + + constexpr char first_column_name[] = "foo"; + PutLengthPrefixedSlice(&buf, first_column_name); + + // Can't decode the size of the first column value + { + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "value size")); + } + + constexpr uint32_t first_value_size = 16; + PutVarint32(&buf, first_value_size); + + // Can't decode the second column name + { + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "name")); + } + + constexpr char second_column_name[] = "hello"; + PutLengthPrefixedSlice(&buf, second_column_name); + + // Can't decode the size of the second column value + { + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "value size")); + } + + constexpr uint32_t second_value_size = 64; + PutVarint32(&buf, second_value_size); + + // Can't decode the payload of the first column + { + Slice input(buf); + WideColumns 
columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "payload")); + } + + buf.append(first_value_size, '0'); + + // Can't decode the payload of the second column + { + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "payload")); + } + + buf.append(second_value_size, 'x'); + + // Success + { + Slice input(buf); + WideColumns columns; + + ASSERT_OK(WideColumnSerialization::Deserialize(input, columns)); + } +} + +TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) { + std::string buf; + + PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + + constexpr uint32_t num_columns = 2; + PutVarint32(&buf, num_columns); + + constexpr char first_column_name[] = "b"; + PutLengthPrefixedSlice(&buf, first_column_name); + + constexpr uint32_t first_value_size = 16; + PutVarint32(&buf, first_value_size); + + constexpr char second_column_name[] = "a"; + PutLengthPrefixedSlice(&buf, second_column_name); + + Slice input(buf); + WideColumns columns; + + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "order")); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/wide/wide_columns.cc b/src/rocksdb/db/wide/wide_columns.cc new file mode 100644 index 000000000..186be7f85 --- /dev/null +++ b/src/rocksdb/db/wide/wide_columns.cc @@ -0,0 +1,22 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/wide_columns.h" + +#include "db/wide/wide_column_serialization.h" + +namespace ROCKSDB_NAMESPACE { + +const Slice kDefaultWideColumnName; + +const WideColumns kNoWideColumns; + +Status PinnableWideColumns::CreateIndexForWideColumns() { + Slice value_copy = value_; + + return WideColumnSerialization::Deserialize(value_copy, columns_); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc new file mode 100644 index 000000000..796697cfc --- /dev/null +++ b/src/rocksdb/db/write_batch.cc @@ -0,0 +1,3137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeDeletion varstring +// kTypeSingleDeletion varstring +// kTypeRangeDeletion varstring varstring +// kTypeMerge varstring varstring +// kTypeColumnFamilyValue varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring +// kTypeColumnFamilySingleDeletion varint32 varstring +// kTypeColumnFamilyRangeDeletion varint32 varstring varstring +// kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeBeginPrepareXID +// kTypeEndPrepareXID varstring +// kTypeCommitXID varstring +// kTypeCommitXIDAndTimestamp varstring varstring +// kTypeRollbackXID varstring +// kTypeBeginPersistedPrepareXID +// kTypeBeginUnprepareXID +// kTypeWideColumnEntity varstring varstring +// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring +// kTypeNoop +// varstring := +// len: varint32 +// data: uint8[len] + +#include "rocksdb/write_batch.h" + +#include <algorithm> +#include <atomic> +#include <map> +#include <stack> +#include <stdexcept> +#include <type_traits> +#include <unordered_map> +#include <vector> + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/flush_scheduler.h" +#include "db/kv_checksum.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" +#include "db/wide/wide_column_serialization.h" +#include "db/write_batch_internal.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" +#include "util/autovector.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/duplicate_detector.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// anon namespace for file-local types +namespace { + +enum ContentFlags : uint32_t { + DEFERRED = 1 << 0, + HAS_PUT = 1 << 1, + HAS_DELETE = 1 << 2, + HAS_SINGLE_DELETE = 1 << 3, + HAS_MERGE = 1 << 4, + HAS_BEGIN_PREPARE = 1 << 5, + HAS_END_PREPARE = 1 << 6, + HAS_COMMIT = 1 << 7, + HAS_ROLLBACK = 1 << 8, + HAS_DELETE_RANGE = 1 << 9, + HAS_BLOB_INDEX = 1 << 10, + HAS_BEGIN_UNPREPARE = 1 << 11, + HAS_PUT_ENTITY = 1 << 12, +}; + +struct BatchContentClassifier : public WriteBatch::Handler { + uint32_t content_flags = 0; + + Status PutCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_PUT; + return Status::OK(); + } + + Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */, + const Slice& /* entity */) override { + content_flags |= ContentFlags::HAS_PUT_ENTITY; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_DELETE; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_SINGLE_DELETE; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_DELETE_RANGE; + return Status::OK(); + } + + Status MergeCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_MERGE; + return Status::OK(); + } + + Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_BLOB_INDEX; + return Status::OK(); + } + + Status MarkBeginPrepare(bool unprepare) override { + content_flags |= ContentFlags::HAS_BEGIN_PREPARE; + if (unprepare) { + content_flags |= 
ContentFlags::HAS_BEGIN_UNPREPARE; + } + return Status::OK(); + } + + Status MarkEndPrepare(const Slice&) override { + content_flags |= ContentFlags::HAS_END_PREPARE; + return Status::OK(); + } + + Status MarkCommit(const Slice&) override { + content_flags |= ContentFlags::HAS_COMMIT; + return Status::OK(); + } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_COMMIT; + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { + content_flags |= ContentFlags::HAS_ROLLBACK; + return Status::OK(); + } +}; + +} // anonymous namespace + +struct SavePoints { + std::stack<SavePoint, autovector<SavePoint>> stack; +}; + +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key, size_t default_cf_ts_sz) + : content_flags_(0), + max_bytes_(max_bytes), + default_cf_ts_sz_(default_cf_ts_sz), + rep_() { + // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per + // entry. + assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8); + if (protection_bytes_per_key != 0) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? reserved_bytes + : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); +} + +WriteBatch::WriteBatch(const std::string& rep) + : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {} + +WriteBatch::WriteBatch(std::string&& rep) + : content_flags_(ContentFlags::DEFERRED), + max_bytes_(0), + rep_(std::move(rep)) {} + +WriteBatch::WriteBatch(const WriteBatch& src) + : wal_term_point_(src.wal_term_point_), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + max_bytes_(src.max_bytes_), + default_cf_ts_sz_(src.default_cf_ts_sz_), + rep_(src.rep_) { + if (src.save_points_ != nullptr) { + save_points_.reset(new SavePoints()); + save_points_->stack = src.save_points_->stack; + } + if (src.prot_info_ != nullptr) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + prot_info_->entries_ = src.prot_info_->entries_; + } +} + +WriteBatch::WriteBatch(WriteBatch&& src) noexcept + : save_points_(std::move(src.save_points_)), + wal_term_point_(std::move(src.wal_term_point_)), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + max_bytes_(src.max_bytes_), + prot_info_(std::move(src.prot_info_)), + default_cf_ts_sz_(src.default_cf_ts_sz_), + rep_(std::move(src.rep_)) {} + +WriteBatch& WriteBatch::operator=(const WriteBatch& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(src); + } + return *this; +} + +WriteBatch& WriteBatch::operator=(WriteBatch&& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(std::move(src)); + } + return *this; +} + +WriteBatch::~WriteBatch() {} + +WriteBatch::Handler::~Handler() {} + +void WriteBatch::Handler::LogData(const Slice& /*blob*/) { + // If the user has not specified something to do with blobs, then we ignore + // them. 
+} + +bool WriteBatch::Handler::Continue() { return true; } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(WriteBatchInternal::kHeader); + + content_flags_.store(0, std::memory_order_relaxed); + + if (save_points_ != nullptr) { + while (!save_points_->stack.empty()) { + save_points_->stack.pop(); + } + } + + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } + wal_term_point_.clear(); + default_cf_ts_sz_ = 0; +} + +uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); } + +uint32_t WriteBatch::ComputeContentFlags() const { + auto rv = content_flags_.load(std::memory_order_relaxed); + if ((rv & ContentFlags::DEFERRED) != 0) { + BatchContentClassifier classifier; + // Should we handle status here? + Iterate(&classifier).PermitUncheckedError(); + rv = classifier.content_flags; + + // this method is conceptually const, because it is performing a lazy + // computation that doesn't affect the abstract state of the batch. + // content_flags_ is marked mutable so that we can perform the + // following assignment + content_flags_.store(rv, std::memory_order_relaxed); + } + return rv; +} + +void WriteBatch::MarkWalTerminationPoint() { + wal_term_point_.size = GetDataSize(); + wal_term_point_.count = Count(); + wal_term_point_.content_flags = content_flags_; +} + +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + +bool WriteBatch::HasPut() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; +} + +bool WriteBatch::HasPutEntity() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0; +} + +bool WriteBatch::HasDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0; +} + +bool WriteBatch::HasSingleDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0; +} + +bool WriteBatch::HasDeleteRange() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0; +} + +bool WriteBatch::HasMerge() const { + return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0; +} + +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) { + assert(input != nullptr && key != nullptr); + // Skip tag byte + input->remove_prefix(1); + + if (cf_record) { + // Skip column_family bytes + uint32_t cf; + if (!GetVarint32(input, &cf)) { + return false; + } + } + + // Extract key + return GetLengthPrefixedSlice(input, key); +} + +bool WriteBatch::HasBeginPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0; +} + +bool WriteBatch::HasEndPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0; +} + +bool WriteBatch::HasCommit() const { + return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0; +} + +bool WriteBatch::HasRollback() const { + return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0; +} + +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid) { + assert(key != nullptr && value != nullptr); + *tag = (*input)[0]; + input->remove_prefix(1); + *column_family = 0; // default + switch (*tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + FALLTHROUGH_INTENDED; + case kTypeValue: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad 
WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + case kTypeColumnFamilySingleDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + FALLTHROUGH_INTENDED; + case kTypeDeletion: + case kTypeSingleDeletion: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyRangeDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + FALLTHROUGH_INTENDED; + case kTypeRangeDeletion: + // for range delete, "key" is begin_key, "value" is end_key + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + FALLTHROUGH_INTENDED; + case kTypeMerge: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeColumnFamilyBlobIndex: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + FALLTHROUGH_INTENDED; + case kTypeBlobIndex: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + break; + case kTypeLogData: + assert(blob != nullptr); + if (!GetLengthPrefixedSlice(input, blob)) { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + case kTypeNoop: + case kTypeBeginPrepareXID: + // This indicates that the prepared batch is also persisted in the db. + // This is used in WritePreparedTxn + case kTypeBeginPersistedPrepareXID: + // This is used in WriteUnpreparedTxn + case kTypeBeginUnprepareXID: + break; + case kTypeEndPrepareXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad EndPrepare XID"); + } + break; + case kTypeCommitXIDAndTimestamp: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad commit timestamp"); + } + FALLTHROUGH_INTENDED; + case kTypeCommitXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Commit XID"); + } + break; + case kTypeRollbackXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Rollback XID"); + } + break; + case kTypeColumnFamilyWideColumnEntity: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch PutEntity"); + } + FALLTHROUGH_INTENDED; + case kTypeWideColumnEntity: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch PutEntity"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + return Status::OK(); +} + +Status WriteBatch::Iterate(Handler* handler) const { + if (rep_.size() < WriteBatchInternal::kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, 
static_cast<size_t>(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + + Slice key, value, blob, xid; + + // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as + // the batch boundary symbols otherwise we would mis-count the number of + // batches. We do that by checking whether the accumulated batch is empty + // before seeing the next Noop. + bool empty_batch = true; + uint32_t found = 0; + Status s; + char tag = 0; + uint32_t column_family = 0; // default + bool last_was_try_again = false; + bool handler_continue = true; + while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) { + handler_continue = handler->Continue(); + if (!handler_continue) { + break; + } + + if (LIKELY(!s.IsTryAgain())) { + last_was_try_again = false; + tag = 0; + column_family = 0; // default + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, + &blob, &xid); + if (!s.ok()) { + return s; + } + } else { + assert(s.IsTryAgain()); + assert(!last_was_try_again); // to detect infinite loop bugs + if (UNLIKELY(last_was_try_again)) { + return Status::Corruption( + "two consecutive TryAgain in WriteBatch handler; this is either a " + "software bug or data corruption."); + } + last_was_try_again = true; + s = Status::OK(); + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); + s = handler->PutCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); + s = handler->DeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); + s = handler->SingleDeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyRangeDeletion: + case kTypeRangeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); + s = handler->DeleteRangeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); + s = handler->MergeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyBlobIndex: + case kTypeBlobIndex: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); + s = handler->PutBlobIndexCF(column_family, key, value); + if (LIKELY(s.ok())) { + found++; + } + break; + case kTypeLogData: + handler->LogData(blob); + // A batch might have nothing but LogData. It is still a batch. 
+ empty_batch = false; + break; + case kTypeBeginPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + s = handler->MarkBeginPrepare(); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kDisabled) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_after_commit_ is disabled (in " + "WritePrepared/WriteUnprepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + if (handler->WriteBeforePrepare() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_before_prepare_ is enabled " + "(in WriteUnprepared mode). If it is not due to corruption, the " + "WAL must be emptied before changing the WritePolicy."); + } + break; + case kTypeBeginPersistedPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + s = handler->MarkBeginPrepare(); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WritePrepared/WriteUnprepared txn tag when write_after_commit_ " + "is enabled (in default WriteCommitted mode). If it is not due " + "to corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeBeginUnprepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_after_commit_ is enabled (in " + "default WriteCommitted mode). If it is not due to corruption, " + "the WAL must be emptied before changing the WritePolicy."); + } + if (handler->WriteBeforePrepare() == + WriteBatch::Handler::OptionState::kDisabled) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_before_prepare_ is disabled " + "(in WriteCommitted/WritePrepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeEndPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeCommitXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + s = handler->MarkCommit(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeCommitXIDAndTimestamp: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + // key stores the commit timestamp. 
+ assert(!key.empty()); + s = handler->MarkCommitWithTimestamp(xid, key); + if (LIKELY(s.ok())) { + empty_batch = true; + } + break; + case kTypeRollbackXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); + s = handler->MarkRollback(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeNoop: + s = handler->MarkNoop(empty_batch); + assert(s.ok()); + empty_batch = true; + break; + case kTypeWideColumnEntity: + case kTypeColumnFamilyWideColumnEntity: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY)); + s = handler->PutEntityCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + ++found; + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (!s.ok()) { + return s; + } + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) { + return b->is_latest_persistent_state_; +} + +void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) { + b->is_latest_persistent_state_ = true; +} + +uint32_t WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { + return WriteBatchInternal::kHeader; +} + +std::tuple<Status, uint32_t, size_t> +WriteBatchInternal::GetColumnFamilyIdAndTimestampSize( + WriteBatch* b, ColumnFamilyHandle* column_family) { + uint32_t cf_id = GetColumnFamilyID(column_family); + size_t ts_sz = 0; + Status s; + if (column_family) { + const Comparator* const ucmp = column_family->GetComparator(); + if (ucmp) { + ts_sz = ucmp->timestamp_size(); + if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) { + s = Status::InvalidArgument("Default cf timestamp size mismatch"); + } + } + } else if (b->default_cf_ts_sz_ > 0) { + ts_sz = b->default_cf_ts_sz_; + } + return std::make_tuple(s, cf_id, ts_sz); +} + +namespace { +Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family, + const Slice& ts) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be null"); + } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t cf_ts_sz = ucmp->timestamp_size(); + if (0 == cf_ts_sz) { + return Status::InvalidArgument("timestamp disabled"); + } + if (cf_ts_sz != ts.size()) { + return Status::InvalidArgument("timestamp size mismatch"); + } + return Status::OK(); +} +} // anonymous namespace + +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("value is too large"); + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + 
b->rep_.push_back(static_cast<char>(kTypeValue)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been `kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Put(this, cf_id, key, value); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array<Slice, 2> key_with_ts{{key, dummy_ts}}; + return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + SliceParts(&value, 1)); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + has_key_with_ts_ = true; + assert(column_family); + uint32_t cf_id = column_family->GetID(); + std::array<Slice, 2> key_with_ts{{key, ts}}; + return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + SliceParts(&value, 1)); +} + +Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value) { + size_t total_key_bytes = 0; + for (int i = 0; i < key.num_parts; ++i) { + total_key_bytes += key.parts[i].size(); + } + if (total_key_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("key is too large"); + } + + size_t total_value_bytes = 0; + for (int i = 0; i < value.num_parts; ++i) { + total_value_bytes += value.parts[i].size(); + } + if (total_value_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("value is too large"); + } + return Status::OK(); +}
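The Put overloads above show the user-defined timestamp strategy: when the column family has a nonzero timestamp size but the caller supplies none, the key is encoded as a two-part SliceParts whose second part is a zero-filled placeholder, and needs_in_place_update_ts_ marks the batch for a later in-place fix-up. A condensed sketch of that key shape, assuming an 8-byte (e.g. uint64_t) timestamp:

    const size_t ts_sz = 8;                   // column family's timestamp size
    const std::string dummy_ts(ts_sz, '\0');  // placeholder, patched later
    std::array<Slice, 2> key_with_ts{{Slice("key"), Slice(dummy_ts)}};
    // SliceParts(key_with_ts.data(), 2) is what WriteBatchInternal::Put()
    // receives: the user key and the dummy timestamp, concatenated on write.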
+ +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeValue)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (ts_sz == 0) { + return WriteBatchInternal::Put(this, cf_id, key, value); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id, + const Slice& key, + const WideColumns& columns) { + assert(b); + + if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("key is too large"); + } + + WideColumns sorted_columns(columns); + std::sort(sorted_columns.begin(), sorted_columns.end(), + [](const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name().compare(rhs.name()) < 0; + }); + + std::string entity; + const Status s = WideColumnSerialization::Serialize(sorted_columns, entity); + if (!s.ok()) { + return s; + } + + if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) { + return Status::InvalidArgument("wide column entity is too large"); + } + + LocalSavePoint save(b); + + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeWideColumnEntity)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyWideColumnEntity)); + PutVarint32(&b->rep_, column_family_id); + } + + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, entity); + + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_PUT_ENTITY, + std::memory_order_relaxed); + + if (b->prot_info_ != nullptr) { + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, entity, kTypeWideColumnEntity) + .ProtectC(column_family_id)); + } + + return save.commit(); +} + +Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family, + const Slice& key, const WideColumns& columns) { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call this method without a column family handle"); + } + + Status s; + uint32_t cf_id = 0; + size_t ts_sz = 0; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (ts_sz) { + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); + } + + return WriteBatchInternal::PutEntity(this, cf_id, key, columns); +} + +Status WriteBatchInternal::InsertNoop(WriteBatch* b) { + b->rep_.push_back(static_cast<char>(kTypeNoop)); + return Status::OK(); +} + +Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid, + bool write_after_commit, + bool unprepared_batch) { + // a manually constructed batch can only contain one prepare section + assert(b->rep_[12] == static_cast<char>(kTypeNoop)); + + // all savepoints up to this point are cleared + if (b->save_points_ != nullptr) { + while (!b->save_points_->stack.empty()) { + b->save_points_->stack.pop(); + } + } + + // rewrite noop as begin marker + b->rep_[12] = static_cast<char>( + write_after_commit ? kTypeBeginPrepareXID + : (unprepared_batch ? 
kTypeBeginUnprepareXID + : kTypeBeginPersistedPrepareXID)); + b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_END_PREPARE | + ContentFlags::HAS_BEGIN_PREPARE, + std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } + return Status::OK(); +} + +Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast<char>(kTypeCommitXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b, + const Slice& xid, + const Slice& commit_ts) { + assert(!commit_ts.empty()); + b->rep_.push_back(static_cast<char>(kTypeCommitXIDAndTimestamp)); + PutLengthPrefixedSlice(&b->rep_, commit_ts); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast<char>(kTypeRollbackXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_ROLLBACK, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Delete(this, cf_id, key); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array<Slice, 2> key_with_ts{{key, dummy_ts}}; + return WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + assert(column_family); + has_key_with_ts_ = true; + uint32_t cf_id = column_family->GetID(); + std::array<Slice, 2> key_with_ts{{key, ts}}; + return WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Delete(this, cf_id, key); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::SingleDelete(WriteBatch* b, + uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast<char>(kTypeSingleDeletion)); + } else { + b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, "" /* value */, kTypeSingleDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const Slice& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::SingleDelete(this, cf_id, key);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+  return WriteBatchInternal::SingleDelete(this, cf_id,
+                                          SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& ts) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::SingleDelete(this, cf_id,
+                                          SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+                                        uint32_t column_family_id,
+                                        const SliceParts& key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_SINGLE_DELETE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key,
+                        SliceParts(nullptr /* _parts */,
+                                   0 /* _num_parts */) /* value */,
+                        kTypeSingleDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const SliceParts& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::SingleDelete(this, cf_id, key);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                                       const Slice& begin_key,
+                                       const Slice& end_key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, begin_key);
+  PutLengthPrefixedSlice(&b->rep_, end_key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE_RANGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    // In `DeleteRange()`, the end key is treated as the value.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const Slice& begin_key, const Slice& end_key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
+  std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
+  return WriteBatchInternal::DeleteRange(
+      this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
+      SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const Slice& begin_key, const Slice& end_key,
+                               const Slice& ts) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(column_family);
+  has_key_with_ts_ = true;
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{begin_key, ts}};
+  std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
+  return WriteBatchInternal::DeleteRange(
+      this, cf_id, SliceParts(key_with_ts.data(), 2),
+      SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                                       const SliceParts& begin_key,
+                                       const SliceParts& end_key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+  PutLengthPrefixedSliceParts(&b->rep_, end_key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE_RANGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    // In `DeleteRange()`, the end key is treated as the value.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const SliceParts& begin_key,
+                               const SliceParts& end_key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                                 const Slice& key, const Slice& value) {
+  if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("key is too large");
+  }
+  if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("value is too large");
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_MERGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeMerge)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                         const Slice& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Merge(this, cf_id, key, value);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                         const Slice& ts, const Slice& value) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                                 const SliceParts& key,
+                                 const SliceParts& value) {
+  Status s = CheckSlicePartsLength(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_MERGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeMerge)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+                         const SliceParts& key, const SliceParts& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Merge(this, cf_id, key, value);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+                                        uint32_t column_family_id,
+                                        const Slice& key, const Slice& value) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_BLOB_INDEX,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, value, kTypeBlobIndex)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::PutLogData(const Slice& blob) {
+  LocalSavePoint save(this);
+  rep_.push_back(static_cast<char>(kTypeLogData));
+  PutLengthPrefixedSlice(&rep_, blob);
+  return save.commit();
+}
+
+void WriteBatch::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_.reset(new SavePoints());
+  }
+  // Record length and count of current batch of writes.
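+  // Illustrative usage: savepoints nest on a stack, so a caller can do, e.g.,
+  //   WriteBatch batch;
+  //   batch.Put("k1", "v1");
+  //   batch.SetSavePoint();
+  //   batch.Put("k2", "v2");
+  //   batch.RollbackToSavePoint();  // drops "k2", keeps "k1"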
+  save_points_->stack.push(SavePoint(
+      GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  SavePoint savepoint = save_points_->stack.top();
+  save_points_->stack.pop();
+
+  assert(savepoint.size <= rep_.size());
+  assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+  if (savepoint.size == rep_.size()) {
+    // No changes to rollback
+  } else if (savepoint.size == 0) {
+    // Rollback everything
+    Clear();
+  } else {
+    rep_.resize(savepoint.size);
+    if (prot_info_ != nullptr) {
+      prot_info_->entries_.resize(savepoint.count);
+    }
+    WriteBatchInternal::SetCount(this, savepoint.count);
+    content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+  }
+
+  return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  save_points_->stack.pop();
+
+  return Status::OK();
+}
+
+Status WriteBatch::UpdateTimestamps(
+    const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
+  TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
+                                                    std::move(ts_sz_func), ts);
+  const Status s = Iterate(&ts_updater);
+  if (s.ok()) {
+    needs_in_place_update_ts_ = false;
+  }
+  return s;
+}
+
+Status WriteBatch::VerifyChecksum() const {
+  if (prot_info_ == nullptr) {
+    return Status::OK();
+  }
+  Slice input(rep_.data() + WriteBatchInternal::kHeader,
+              rep_.size() - WriteBatchInternal::kHeader);
+  Slice key, value, blob, xid;
+  char tag = 0;
+  uint32_t column_family = 0;  // default
+  Status s;
+  size_t prot_info_idx = 0;
+  bool checksum_protected = true;
+  while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
+    // In case key/value/column_family are not updated by
+    // ReadRecordFromWriteBatch
+    key.clear();
+    value.clear();
+    column_family = 0;
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                 &blob, &xid);
+    if (!s.ok()) {
+      return s;
+    }
+    checksum_protected = true;
+    // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
+    // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
+    // compute the checksum), and encodes column family id separately. See
+    // comment in first `WriteBatchInternal::Put()` for more detail.
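+    // For example, a kTypeColumnFamilyValue record for column family 5 is
+    // verified as if it were protected with
+    // ProtectKVO(key, value, kTypeValue).ProtectC(5).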
+    switch (tag) {
+      case kTypeColumnFamilyValue:
+      case kTypeValue:
+        tag = kTypeValue;
+        break;
+      case kTypeColumnFamilyDeletion:
+      case kTypeDeletion:
+        tag = kTypeDeletion;
+        break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        tag = kTypeSingleDeletion;
+        break;
+      case kTypeColumnFamilyRangeDeletion:
+      case kTypeRangeDeletion:
+        tag = kTypeRangeDeletion;
+        break;
+      case kTypeColumnFamilyMerge:
+      case kTypeMerge:
+        tag = kTypeMerge;
+        break;
+      case kTypeColumnFamilyBlobIndex:
+      case kTypeBlobIndex:
+        tag = kTypeBlobIndex;
+        break;
+      case kTypeLogData:
+      case kTypeBeginPrepareXID:
+      case kTypeEndPrepareXID:
+      case kTypeCommitXID:
+      case kTypeRollbackXID:
+      case kTypeNoop:
+      case kTypeBeginPersistedPrepareXID:
+      case kTypeBeginUnprepareXID:
+      case kTypeDeletionWithTimestamp:
+      case kTypeCommitXIDAndTimestamp:
+        checksum_protected = false;
+        break;
+      case kTypeColumnFamilyWideColumnEntity:
+      case kTypeWideColumnEntity:
+        tag = kTypeWideColumnEntity;
+        break;
+      default:
+        return Status::Corruption(
+            "unknown WriteBatch tag",
+            std::to_string(static_cast<unsigned int>(tag)));
+    }
+    if (checksum_protected) {
+      s = prot_info_->entries_[prot_info_idx++]
+              .StripC(column_family)
+              .StripKVO(key, value, static_cast<ValueType>(tag))
+              .GetStatus();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  if (prot_info_idx != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  }
+  assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
+  return Status::OK();
+}
+
+namespace {
+
+class MemTableInserter : public WriteBatch::Handler {
+  SequenceNumber sequence_;
+  ColumnFamilyMemTables* const cf_mems_;
+  FlushScheduler* const flush_scheduler_;
+  TrimHistoryScheduler* const trim_history_scheduler_;
+  const bool ignore_missing_column_families_;
+  const uint64_t recovering_log_number_;
+  // log number that all Memtables inserted into should reference
+  uint64_t log_number_ref_;
+  DBImpl* db_;
+  const bool concurrent_memtable_writes_;
+  bool post_info_created_;
+  const WriteBatch::ProtectionInfo* prot_info_;
+  size_t prot_info_idx_;
+
+  bool* has_valid_writes_;
+  // On some (!) platforms just default creating
+  // a map is too expensive in the Write() path as they
+  // cause memory allocations though unused.
+  // Make creation optional but do not incur
+  // std::unique_ptr additional allocation
+  using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+  using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+  PostMapType mem_post_info_map_;
+  // current recovered transaction we are rebuilding (recovery)
+  WriteBatch* rebuilding_trx_;
+  SequenceNumber rebuilding_trx_seq_;
+  // Increase seq number once per each write batch. Otherwise increase it once
+  // per key.
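+  // For example, a batch with three Puts consumes three sequence numbers
+  // under the default per-key policy, but only a single sequence number when
+  // seq_per_batch is in effect.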
+  bool seq_per_batch_;
+  // Whether the memtable write will be done only after the commit
+  bool write_after_commit_;
+  // Whether memtable write can be done before prepare
+  bool write_before_prepare_;
+  // Whether this batch was unprepared or not
+  bool unprepared_batch_;
+  using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+  DupDetector duplicate_detector_;
+  bool dup_dectector_on_;
+
+  bool hint_per_batch_;
+  bool hint_created_;
+  // Hints for this batch
+  using HintMap = std::unordered_map<MemTable*, void*>;
+  using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+  HintMapType hint_;
+
+  HintMap& GetHintMap() {
+    assert(hint_per_batch_);
+    if (!hint_created_) {
+      new (&hint_) HintMap();
+      hint_created_ = true;
+    }
+    return *reinterpret_cast<HintMap*>(&hint_);
+  }
+
+  MemPostInfoMap& GetPostMap() {
+    assert(concurrent_memtable_writes_);
+    if (!post_info_created_) {
+      new (&mem_post_info_map_) MemPostInfoMap();
+      post_info_created_ = true;
+    }
+    return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+  }
+
+  bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+    assert(!write_after_commit_);
+    assert(rebuilding_trx_ != nullptr);
+    if (!dup_dectector_on_) {
+      new (&duplicate_detector_) DuplicateDetector(db_);
+      dup_dectector_on_ = true;
+    }
+    return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+        ->IsDuplicateKeySeq(column_family_id, key, sequence_);
+  }
+
+  const ProtectionInfoKVOC64* NextProtectionInfo() {
+    const ProtectionInfoKVOC64* res = nullptr;
+    if (prot_info_ != nullptr) {
+      assert(prot_info_idx_ < prot_info_->entries_.size());
+      res = &prot_info_->entries_[prot_info_idx_];
+      ++prot_info_idx_;
+    }
+    return res;
+  }
+
+  void DecrementProtectionInfoIdxForTryAgain() {
+    if (prot_info_ != nullptr) --prot_info_idx_;
+  }
+
+  void ResetProtectionInfo() {
+    prot_info_idx_ = 0;
+    prot_info_ = nullptr;
+  }
+
+ protected:
+  Handler::OptionState WriteBeforePrepare() const override {
+    return write_before_prepare_ ? Handler::OptionState::kEnabled
+                                 : Handler::OptionState::kDisabled;
+  }
+  Handler::OptionState WriteAfterCommit() const override {
+    return write_after_commit_ ? Handler::OptionState::kEnabled
+                               : Handler::OptionState::kDisabled;
+  }
+
+ public:
+  // cf_mems should not be shared with concurrent inserters
+  MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+                   FlushScheduler* flush_scheduler,
+                   TrimHistoryScheduler* trim_history_scheduler,
+                   bool ignore_missing_column_families,
+                   uint64_t recovering_log_number, DB* db,
+                   bool concurrent_memtable_writes,
+                   const WriteBatch::ProtectionInfo* prot_info,
+                   bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+                   bool batch_per_txn = true, bool hint_per_batch = false)
+      : sequence_(_sequence),
+        cf_mems_(cf_mems),
+        flush_scheduler_(flush_scheduler),
+        trim_history_scheduler_(trim_history_scheduler),
+        ignore_missing_column_families_(ignore_missing_column_families),
+        recovering_log_number_(recovering_log_number),
+        log_number_ref_(0),
+        db_(static_cast_with_check<DBImpl>(db)),
+        concurrent_memtable_writes_(concurrent_memtable_writes),
+        post_info_created_(false),
+        prot_info_(prot_info),
+        prot_info_idx_(0),
+        has_valid_writes_(has_valid_writes),
+        rebuilding_trx_(nullptr),
+        rebuilding_trx_seq_(0),
+        seq_per_batch_(seq_per_batch),
+        // Write after commit currently uses one seq per key (instead of per
+        // batch). So seq_per_batch being false indicates write_after_commit
+        // approach.
+        write_after_commit_(!seq_per_batch),
+        // WriteUnprepared can write WriteBatches per transaction, so
+        // batch_per_txn being false indicates write_before_prepare.
+        write_before_prepare_(!batch_per_txn),
+        unprepared_batch_(false),
+        duplicate_detector_(),
+        dup_dectector_on_(false),
+        hint_per_batch_(hint_per_batch),
+        hint_created_(false) {
+    assert(cf_mems_);
+  }
+
+  ~MemTableInserter() override {
+    if (dup_dectector_on_) {
+      reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+          ->~DuplicateDetector();
+    }
+    if (post_info_created_) {
+      reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
+    }
+    if (hint_created_) {
+      for (auto iter : GetHintMap()) {
+        delete[] reinterpret_cast<char*>(iter.second);
+      }
+      reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+    }
+    delete rebuilding_trx_;
+  }
+
+  MemTableInserter(const MemTableInserter&) = delete;
+  MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+  // The batch seq is regularly restarted; In normal mode it is set when
+  // MemTableInserter is constructed in the write thread and in recovery mode
+  // it is set when a batch, which is tagged with seq, is read from the WAL.
+  // Within a sequenced batch, which could be a merge of multiple batches, we
+  // have two policies to advance the seq: i) seq_per_key (default) and ii)
+  // seq_per_batch. To implement the latter we need to mark the boundary
+  // between the individual batches. The approach is this: 1) Use the
+  // terminating markers to indicate the boundary (kTypeEndPrepareXID,
+  // kTypeCommitXID, kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in
+  // the absence of a natural boundary marker.
+  void MaybeAdvanceSeq(bool batch_boundry = false) {
+    if (batch_boundry == seq_per_batch_) {
+      sequence_++;
+    }
+  }
+
+  void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+  void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
+    prot_info_ = prot_info;
+    prot_info_idx_ = 0;
+  }
+
+  SequenceNumber sequence() const { return sequence_; }
+
+  void PostProcess() {
+    assert(concurrent_memtable_writes_);
+    // If post info was not created there is nothing
+    // to process and no need to create on demand
+    if (post_info_created_) {
+      for (auto& pair : GetPostMap()) {
+        pair.first->BatchPostProcess(pair.second);
+      }
+    }
+  }
+
+  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+    // If we are in a concurrent mode, it is the caller's responsibility
+    // to clone the original ColumnFamilyMemTables so that each thread
+    // has its own instance. Otherwise, it must be guaranteed that there
+    // is no concurrent access
+    bool found = cf_mems_->Seek(column_family_id);
+    if (!found) {
+      if (ignore_missing_column_families_) {
+        *s = Status::OK();
+      } else {
+        *s = Status::InvalidArgument(
+            "Invalid column family specified in write batch");
+      }
+      return false;
+    }
+    if (recovering_log_number_ != 0 &&
+        recovering_log_number_ < cf_mems_->GetLogNumber()) {
+      // This is true only in recovery environment (recovering_log_number_ is
+      // always 0 in non-recovery, regular write code-path)
+      // * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means
+      // that column family already contains updates from this log.
+      // We can't apply updates twice because of update-in-place or merge
+      // workloads -- ignore the update
+      *s = Status::OK();
+      return false;
+    }
+
+    if (has_valid_writes_ != nullptr) {
+      *has_valid_writes_ = true;
+    }
+
+    if (log_number_ref_ > 0) {
+      cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+    }
+
+    return true;
+  }
+
+  Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+                   const Slice& value, ValueType value_type,
+                   const ProtectionInfoKVOS64* kv_prot_info) {
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key,
+                                     value);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+                                             key, value);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    MemTable* mem = cf_mems_->GetMemTable();
+    auto* moptions = mem->GetImmutableMemTableOptions();
+    // inplace_update_support is inconsistent with snapshots, and therefore
+    // with any kind of transactions including the ones that use seq_per_batch
+    assert(!seq_per_batch_ || !moptions->inplace_update_support);
+    if (!moptions->inplace_update_support) {
+      ret_status =
+          mem->Add(sequence_, value_type, key, value, kv_prot_info,
+                   concurrent_memtable_writes_, get_post_process_info(mem),
+                   hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+    } else if (moptions->inplace_callback == nullptr ||
+               value_type != kTypeValue) {
+      assert(!concurrent_memtable_writes_);
+      ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info);
+    } else {
+      assert(!concurrent_memtable_writes_);
+      assert(value_type == kTypeValue);
+      ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info);
+      if (ret_status.IsNotFound()) {
+        // key not found in memtable. Do sst get, update, add
+        SnapshotImpl read_from_snapshot;
+        read_from_snapshot.number_ = sequence_;
+        ReadOptions ropts;
+        // it's going to be overwritten for sure, so no point caching data
+        // block containing the old version
+        ropts.fill_cache = false;
+        ropts.snapshot = &read_from_snapshot;
+
+        std::string prev_value;
+        std::string merged_value;
+
+        auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+        Status get_status = Status::NotSupported();
+        if (db_ != nullptr && recovering_log_number_ == 0) {
+          if (cf_handle == nullptr) {
+            cf_handle = db_->DefaultColumnFamily();
+          }
+          // TODO (yanqin): fix when user-defined timestamp is enabled.
+          get_status = db_->Get(ropts, cf_handle, key, &prev_value);
+        }
+        // Intentionally overwrites the `NotFound` in `ret_status`.
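+        // (A `NotFound` from `UpdateCallback()` above just means the key is
+        // not in the memtable; a `NotFound` from the `Get()` is likewise fine
+        // because `inplace_callback` can build the value from scratch.)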
+        if (!get_status.ok() && !get_status.IsNotFound()) {
+          ret_status = get_status;
+        } else {
+          ret_status = Status::OK();
+        }
+        if (ret_status.ok()) {
+          UpdateStatus update_status;
+          char* prev_buffer = const_cast<char*>(prev_value.c_str());
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+          if (get_status.ok()) {
+            update_status = moptions->inplace_callback(prev_buffer, &prev_size,
+                                                       value, &merged_value);
+          } else {
+            update_status = moptions->inplace_callback(
+                nullptr /* existing_value */, nullptr /* existing_value_size */,
+                value, &merged_value);
+          }
+          if (update_status == UpdateStatus::UPDATED_INPLACE) {
+            assert(get_status.ok());
+            if (kv_prot_info != nullptr) {
+              ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+              updated_kv_prot_info.UpdateV(value,
+                                           Slice(prev_buffer, prev_size));
+              // prev_value is updated in-place with final value.
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(prev_buffer, prev_size),
+                                    &updated_kv_prot_info);
+            } else {
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(prev_buffer, prev_size),
+                                    nullptr /* kv_prot_info */);
+            }
+            if (ret_status.ok()) {
+              RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+            }
+          } else if (update_status == UpdateStatus::UPDATED) {
+            if (kv_prot_info != nullptr) {
+              ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+              updated_kv_prot_info.UpdateV(value, merged_value);
+              // merged_value contains the final value.
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(merged_value), &updated_kv_prot_info);
+            } else {
+              // merged_value contains the final value.
+              ret_status =
+                  mem->Add(sequence_, value_type, key, Slice(merged_value),
+                           nullptr /* kv_prot_info */);
+            }
+            if (ret_status.ok()) {
+              RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+            }
+          }
+        }
+      }
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+                                           key, value);
+    }
+    return ret_status;
+  }
+
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    Status ret_status;
+    if (kv_prot_info != nullptr) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             &mem_kv_prot_info);
+    } else {
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             nullptr /* kv_prot_info */);
+    }
+    // TODO: this assumes that if TryAgain status is returned to the caller,
+    // the operation is actually tried again.
+    // The proper way to do this is to pass a `try_again` parameter to the
+    // operation itself and decrement prot_info_idx_ based on that.
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status PutEntityCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+
+    Status s;
+    if (kv_prot_info) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+                    &mem_kv_prot_info);
+    } else {
+      s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+                    /* kv_prot_info */ nullptr);
+    }
+
+    if (UNLIKELY(s.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+
+    return s;
+  }
+
+  Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+                    const Slice& value, ValueType delete_type,
+                    const ProtectionInfoKVOS64* kv_prot_info) {
+    Status ret_status;
+    MemTable* mem = cf_mems_->GetMemTable();
+    ret_status =
+        mem->Add(sequence_, delete_type, key, value, kv_prot_info,
+                 concurrent_memtable_writes_, get_post_process_info(mem),
+                 hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    return ret_status;
+  }
+
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id,
+                                        key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status =
+            WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+
+    ColumnFamilyData* cfd = cf_mems_->current();
+    assert(!cfd || cfd->user_comparator());
+    const size_t ts_sz = (cfd && cfd->user_comparator())
+                             ? cfd->user_comparator()->timestamp_size()
+                             : 0;
+    const ValueType delete_type =
+        (0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp;
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type);
+      ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+                              &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+                              nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object.
+    // If `ret_status` is another non-OK `Status`, then the `rebuilding_trx_`
+    // will be thrown away. So we only need to add to it when
+    // `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status =
+          WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::SingleDelete(rebuilding_trx_,
+                                              column_family_id, key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+                                                      column_family_id, key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = DeleteImpl(column_family_id, key, Slice(),
+                              kTypeSingleDeletion, &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, key, Slice(),
+                              kTypeSingleDeletion, nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+                                                    column_family_id, key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+                                             begin_key, end_key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::DeleteRange(
+            rebuilding_trx_, column_family_id, begin_key, end_key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    if (db_ != nullptr) {
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      auto* cfd =
+          static_cast_with_check<ColumnFamilyHandleImpl>(cf_handle)->cfd();
+      if (!cfd->is_delete_range_supported()) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        return Status::NotSupported(
+            std::string("DeleteRange not supported for table type ") +
+            cfd->ioptions()->table_factory->Name() + " in CF " +
+            cfd->GetName());
+      }
+      int cmp =
+          cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key);
+      if (cmp > 0) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        // It's an empty range where endpoints appear mistaken. Don't bother
+        // applying it to the DB, and return an error to the user.
+        return Status::InvalidArgument("end key comes before start key");
+      } else if (cmp == 0) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        // It's an empty range. Don't bother applying it to the DB.
+        return Status::OK();
+      }
+    }
+
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+                              kTypeRangeDeletion, &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+                              kTypeRangeDeletion, nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::DeleteRange(
+          rebuilding_trx_, column_family_id, begin_key, end_key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+                                       value);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + assert(ret_status.ok()); + + MemTable* mem = cf_mems_->GetMemTable(); + auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } + bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); + + // If we pass DB through and options.max_successive_merges is hit + // during recovery, Get() will be issued which will try to acquire + // DB mutex and cause deadlock, as DB mutex is already held. + // So we disable merge in recovery + if (moptions->max_successive_merges > 0 && db_ != nullptr && + recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= moptions->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. + perform_merge = false; + } else { + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get(), /* result_operand */ nullptr, + /* update_num_ops_stats */ false); + + if (!merge_status.ok()) { + // Failed to merge! 
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } + } + } + } + + if (!perform_merge) { + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); + } + } + + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } + // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); + Status ret_status; + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. 
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+                             &mem_kv_prot_info);
+    } else {
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+                             nullptr /* kv_prot_info */);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  void CheckMemtableFull() {
+    if (flush_scheduler_ != nullptr) {
+      auto* cfd = cf_mems_->current();
+      assert(cfd != nullptr);
+      if (cfd->mem()->ShouldScheduleFlush() &&
+          cfd->mem()->MarkFlushScheduled()) {
+        // MarkFlushScheduled only returns true if we are the one that
+        // should take action, so no need to dedup further
+        flush_scheduler_->ScheduleWork(cfd);
+      }
+    }
+    // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+    if (trim_history_scheduler_ != nullptr) {
+      auto* cfd = cf_mems_->current();
+
+      assert(cfd);
+      assert(cfd->ioptions());
+
+      const size_t size_to_maintain = static_cast<size_t>(
+          cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+      if (size_to_maintain > 0) {
+        MemTableList* const imm = cfd->imm();
+        assert(imm);
+
+        if (imm->HasHistory()) {
+          const MemTable* const mem = cfd->mem();
+          assert(mem);
+
+          if (mem->MemoryAllocatedBytes() +
+                      imm->MemoryAllocatedBytesExcludingLast() >=
+                  size_to_maintain &&
+              imm->MarkTrimHistoryNeeded()) {
+            trim_history_scheduler_->ScheduleWork(cfd);
+          }
+        }
+      }
+    }
+  }
+
+  // The write batch handler calls MarkBeginPrepare with unprepare set to true
+  // if it encounters the kTypeBeginUnprepareXID marker.
+  Status MarkBeginPrepare(bool unprepare) override {
+    assert(rebuilding_trx_ == nullptr);
+    assert(db_);
+
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+      // during recovery we rebuild a hollow transaction
+      // from all encountered prepare sections of the wal
+      if (db_->allow_2pc() == false) {
+        return Status::NotSupported(
+            "WAL contains prepared transactions. Open with "
+            "TransactionDB::Open().");
+      }
+
+      // we are now iterating through a prepared section
+      rebuilding_trx_ = new WriteBatch();
+      rebuilding_trx_seq_ = sequence_;
+      // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+      // unprepared_batch_ should be false because it is false by default, and
+      // gets reset to false in MarkEndPrepare.
+      assert(!unprepared_batch_);
+      unprepared_batch_ = unprepare;
+
+      if (has_valid_writes_ != nullptr) {
+        *has_valid_writes_ = true;
+      }
+    }
+
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& name) override {
+    assert(db_);
+    assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+      assert(db_->allow_2pc());
+      size_t batch_cnt =
+          write_after_commit_
+              ? 0  // 0 will disable further checks
+              : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
+      db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+                                      rebuilding_trx_, rebuilding_trx_seq_,
+                                      batch_cnt, unprepared_batch_);
+      unprepared_batch_ = false;
+      rebuilding_trx_ = nullptr;
+    } else {
+      assert(rebuilding_trx_ == nullptr);
+    }
+    const bool batch_boundry = true;
+    MaybeAdvanceSeq(batch_boundry);
+
+    return Status::OK();
+  }
+
+  Status MarkNoop(bool empty_batch) override {
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+    }
+    // A hack in pessimistic transaction could result into a noop at the start
+    // of the write batch, that should be ignored.
+    if (!empty_batch) {
+      // In the absence of Prepare markers, a kTypeNoop tag indicates the end
+      // of a batch.
+      // This happens when write batch commits skipping the prepare phase.
+      const bool batch_boundry = true;
+      MaybeAdvanceSeq(batch_boundry);
+    }
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& name) override {
+    assert(db_);
+
+    Status s;
+
+    if (recovering_log_number_ != 0) {
+      // We must hold db mutex in recovery.
+      db_->mutex()->AssertHeld();
+      // in recovery when we encounter a commit marker
+      // we lookup this transaction in our set of rebuilt transactions
+      // and commit.
+      auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+      // the log containing the prepared section may have
+      // been released in the last incarnation because the
+      // data was flushed to L0
+      if (trx != nullptr) {
+        // at this point individual CF lognumbers will prevent
+        // duplicate re-insertion of values.
+        assert(log_number_ref_ == 0);
+        if (write_after_commit_) {
+          // write_after_commit_ can only have one batch in trx.
+          assert(trx->batches_.size() == 1);
+          const auto& batch_info = trx->batches_.begin()->second;
+          // all inserts must reference this trx log number
+          log_number_ref_ = batch_info.log_number_;
+          ResetProtectionInfo();
+          s = batch_info.batch_->Iterate(this);
+          log_number_ref_ = 0;
+        }
+        // else the values are already inserted before the commit
+
+        if (s.ok()) {
+          db_->DeleteRecoveredTransaction(name.ToString());
+        }
+        if (has_valid_writes_ != nullptr) {
+          *has_valid_writes_ = true;
+        }
+      }
+    } else {
+      // When writes are not delayed until commit, there is no disconnect
+      // between a memtable write and the WAL that supports it. So the commit
+      // need not reference any log as the only log to which it depends.
+      assert(!write_after_commit_ || log_number_ref_ > 0);
+    }
+    const bool batch_boundry = true;
+    MaybeAdvanceSeq(batch_boundry);
+
+    if (UNLIKELY(s.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+
+    return s;
+  }
+
+  Status MarkCommitWithTimestamp(const Slice& name,
+                                 const Slice& commit_ts) override {
+    assert(db_);
+
+    Status s;
+
+    if (recovering_log_number_ != 0) {
+      // In recovery, db mutex must be held.
+      db_->mutex()->AssertHeld();
+      // in recovery when we encounter a commit marker
+      // we lookup this transaction in our set of rebuilt transactions
+      // and commit.
+      auto trx = db_->GetRecoveredTransaction(name.ToString());
+      // the log containing the prepared section may have
+      // been released in the last incarnation because the
+      // data was flushed to L0
+      if (trx) {
+        // at this point individual CF lognumbers will prevent
+        // duplicate re-insertion of values.
+        assert(0 == log_number_ref_);
+        if (write_after_commit_) {
+          // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + + s = batch_info.batch_->UpdateTimestamps( + commit_ts, [this](uint32_t cf) { + assert(db_); + VersionSet* const vset = db_->GetVersionSet(); + assert(vset); + ColumnFamilySet* const cf_set = vset->GetColumnFamilySet(); + assert(cf_set); + ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf); + assert(cfd); + const auto* const ucmp = cfd->user_comparator(); + assert(ucmp); + return ucmp->timestamp_size(); + }); + if (s.ok()) { + ResetProtectionInfo(); + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + } + // else the values are already inserted before the commit + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_) { + *has_valid_writes_ = true; + } + } + } else { + // When writes are not delayed until commit, there is no connection + // between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log to which it depends. + assert(!write_after_commit_ || log_number_ref_ > 0); + } + constexpr bool batch_boundary = true; + MaybeAdvanceSeq(batch_boundary); + + if (UNLIKELY(s.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + + return s; + } + + Status MarkRollback(const Slice& name) override { + assert(db_); + + if (recovering_log_number_ != 0) { + auto trx = db_->GetRecoveredTransaction(name.ToString()); + + // the log containing the transactions prep section + // may have been released in the previous incarnation + // because we knew it had been rolled back + if (trx != nullptr) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + } else { + // in non recovery we simply ignore this tag + } + + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + + return Status::OK(); + } + + private: + MemTablePostProcessInfo* get_post_process_info(MemTable* mem) { + if (!concurrent_memtable_writes_) { + // No need to batch counters locally if we don't use concurrent mode. + return nullptr; + } + return &GetPostMap()[mem]; + } +}; + +} // anonymous namespace + +// This function can only be called in these conditions: +// 1) During Recovery() +// 2) During Write(), in a single-threaded write thread +// 3) During Write(), in a concurrent context where memtables has been cloned +// The reason is that it calls memtables->Seek(), which has a stateful cache +Status WriteBatchInternal::InsertInto( + WriteThread::WriteGroup& write_group, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { + MemTableInserter inserter( + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, recovery_log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); + for (auto w : write_group) { + if (w->CallbackFailed()) { + continue; + } + w->sequence = inserter.sequence(); + if (!w->ShouldWriteToMemtable()) { + // In seq_per_batch_ mode this advances the seq by one. 
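+      // (Illustrative note: even a writer that skips memtable insertion must
+      // consume its batch's sequence number here so that the sequence
+      // assigned to the next batch in the write group stays aligned.)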
+ inserter.MaybeAdvanceSeq(true); + continue; + } + SetSequence(w->batch, inserter.sequence()); + inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); + w->status = w->batch->Iterate(&inserter); + if (!w->status.ok()) { + return w->status; + } + assert(!seq_per_batch || w->batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt); + } + return Status::OK(); +} + +Status WriteBatchInternal::InsertInto( + WriteThread::Writer* writer, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt, + bool batch_per_txn, bool hint_per_batch) { +#ifdef NDEBUG + (void)batch_cnt; +#endif + assert(writer->ShouldWriteToMemtable()); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); + SetSequence(writer->batch, sequence); + inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); + Status s = writer->batch->Iterate(&inserter); + assert(!seq_per_batch || batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; +} + +Status WriteBatchInternal::InsertInto( + const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, SequenceNumber* next_seq, + bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) { + MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); + Status s = batch->Iterate(&inserter); + if (next_seq != nullptr) { + *next_seq = inserter.sequence(); + } + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; +} + +namespace { + +// This class updates protection info for a WriteBatch. 
+class ProtectionInfoUpdater : public WriteBatch::Handler { + public: + explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info) + : prot_info_(prot_info) {} + + ~ProtectionInfoUpdater() override {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeValue); + } + + Status PutEntityCF(uint32_t cf, const Slice& key, + const Slice& entity) override { + return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return UpdateProtInfo(cf, key, "", kTypeDeletion); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return UpdateProtInfo(cf, key, "", kTypeSingleDeletion); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice& end_key) override { + return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeMerge); + } + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, + const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeBlobIndex); + } + + Status MarkBeginPrepare(bool /* unprepare */) override { + return Status::OK(); + } + + Status MarkEndPrepare(const Slice& /* xid */) override { + return Status::OK(); + } + + Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice& /* xid */, + const Slice& /* ts */) override { + return Status::OK(); + } + + Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); } + + Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); } + + private: + Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val, + const ValueType op_type) { + if (prot_info_) { + prot_info_->entries_.emplace_back( + ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf)); + } + return Status::OK(); + } + + // No copy or move. 
+  ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete;
+  ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete;
+  ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete;
+  ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete;
+
+  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+};
+
+}  // anonymous namespace
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= WriteBatchInternal::kHeader);
+  assert(b->prot_info_ == nullptr);
+
+  b->rep_.assign(contents.data(), contents.size());
+  b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+  return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+                                  const bool wal_only) {
+  assert(dst->Count() == 0 ||
+         (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
+  if ((src->prot_info_ != nullptr &&
+       src->prot_info_->entries_.size() != src->Count()) ||
+      (dst->prot_info_ != nullptr &&
+       dst->prot_info_->entries_.size() != dst->Count())) {
+    return Status::Corruption(
+        "Write batch has inconsistent count and number of checksums");
+  }
+
+  size_t src_len;
+  int src_count;
+  uint32_t src_flags;
+
+  const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+  if (wal_only && !batch_end.is_cleared()) {
+    src_len = batch_end.size - WriteBatchInternal::kHeader;
+    src_count = batch_end.count;
+    src_flags = batch_end.content_flags;
+  } else {
+    src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+    src_count = Count(src);
+    src_flags = src->content_flags_.load(std::memory_order_relaxed);
+  }
+
+  if (src->prot_info_ != nullptr) {
+    if (dst->prot_info_ == nullptr) {
+      dst->prot_info_.reset(new WriteBatch::ProtectionInfo());
+    }
+    std::copy(src->prot_info_->entries_.begin(),
+              src->prot_info_->entries_.begin() + src_count,
+              std::back_inserter(dst->prot_info_->entries_));
+  } else if (dst->prot_info_ != nullptr) {
+    // dst has an empty prot_info_->entries_. In this special case, we allow
+    // appending a write batch without prot_info to a write batch whose
+    // prot_info is empty.
+    dst->prot_info_ = nullptr;
+  }
+  SetCount(dst, Count(dst) + src_count);
+  assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+  dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+  dst->content_flags_.store(
+      dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+      std::memory_order_relaxed);
+  return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+                                            size_t rightByteSize) {
+  if (leftByteSize == 0 || rightByteSize == 0) {
+    return leftByteSize + rightByteSize;
+  } else {
+    return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+  }
+}
+
+Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
+                                                size_t bytes_per_key,
+                                                uint64_t* checksum) {
+  if (bytes_per_key == 0) {
+    if (wb->prot_info_ != nullptr) {
+      wb->prot_info_.reset();
+      return Status::OK();
+    } else {
+      // Already not protected.
+      return Status::OK();
+    }
+  } else if (bytes_per_key == 8) {
+    if (wb->prot_info_ == nullptr) {
+      wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
+      ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
+      Status s = wb->Iterate(&prot_info_updater);
+      if (s.ok() && checksum != nullptr) {
+        uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
+        if (expected_hash != *checksum) {
+          return Status::Corruption("Write batch content corrupted.");
+        }
+      }
+      return s;
+    } else {
+      // Already protected.
+      return Status::OK();
+    }
+  }
+  return Status::NotSupported(
+      "WriteBatch protection info must be zero or eight bytes/key");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementation of the SliceParts variants of Put(). Child classes
+// can override these methods with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+                           const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Put(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+                                    const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+                                   const SliceParts& begin_key,
+                                   const SliceParts& end_key) {
+  std::string begin_key_buf, end_key_buf;
+  Slice begin_key_slice(begin_key, &begin_key_buf);
+  Slice end_key_slice(end_key, &end_key_buf);
+  return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+                                   const SliceParts& end_key) {
+  std::string begin_key_buf, end_key_buf;
+  Slice begin_key_slice(begin_key, &begin_key_buf);
+  Slice end_key_slice(end_key, &end_key_buf);
+  return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+                             const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Merge(key_slice, value_slice);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..1be0bd140
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,401 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include
+#include
+
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+  virtual ~ColumnFamilyMemTables() {}
+  virtual bool Seek(uint32_t column_family_id) = 0;
+  // returns true if the update to memtable should be ignored
+  // (useful when recovering from log whose updates have already
+  // been processed)
+  virtual uint64_t GetLogNumber() const = 0;
+  virtual MemTable* GetMemTable() const = 0;
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+  virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+      : ok_(false), mem_(mem) {}
+
+  bool Seek(uint32_t column_family_id) override {
+    ok_ = (column_family_id == 0);
+    return ok_;
+  }
+
+  uint64_t GetLogNumber() const override { return 0; }
+
+  MemTable* GetMemTable() const override {
+    assert(ok_);
+    return mem_;
+  }
+
+  ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+  bool ok_;
+  MemTable* mem_;
+};
+
+struct WriteBatch::ProtectionInfo {
+  // `WriteBatch` usually doesn't contain a huge number of keys so protecting
+  // with a fixed, non-configurable eight bytes per key may work well enough.
+  autovector<ProtectionInfoKVOC64> entries_;
+
+  size_t GetBytesPerKey() const { return 8; }
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
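+  // A sketch of the resulting layout (fixed-width little-endian integers,
+  // matching the Fixed64/Fixed32 encoding used throughout RocksDB):
+  //
+  //   rep_: [sequence (8 bytes)][count (4 bytes)][record 0][record 1]...
+  //
+  // which is why kHeader is 8 + 4 = 12 and records start at offset 12.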
+  static constexpr size_t kHeader = 12;
+
+  // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+  static Status Put(WriteBatch* batch, uint32_t column_family_id,
+                    const Slice& key, const Slice& value);
+
+  static Status Put(WriteBatch* batch, uint32_t column_family_id,
+                    const SliceParts& key, const SliceParts& value);
+
+  static Status PutEntity(WriteBatch* batch, uint32_t column_family_id,
+                          const Slice& key, const WideColumns& columns);
+
+  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+                       const SliceParts& key);
+
+  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+                       const Slice& key);
+
+  static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                             const SliceParts& key);
+
+  static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                             const Slice& key);
+
+  static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                            const Slice& begin_key, const Slice& end_key);
+
+  static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                            const SliceParts& begin_key,
+                            const SliceParts& end_key);
+
+  static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+                      const Slice& key, const Slice& value);
+
+  static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+                      const SliceParts& key, const SliceParts& value);
+
+  static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+                             const Slice& key, const Slice& value);
+
+  static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+                               const bool write_after_commit = true,
+                               const bool unprepared_batch = false);
+
+  static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+  static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+  static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid,
+                                        const Slice& commit_ts);
+
+  static Status InsertNoop(WriteBatch* batch);
+
+  // Return the number of entries in the batch.
+  static uint32_t Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, uint32_t n);
+
+  // Return the sequence number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the sequence number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  // Returns the offset of the first entry in the batch.
+  // This offset is only valid if the batch is not empty.
+  static size_t GetFirstOffset(WriteBatch* batch);
+
+  static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); }
+
+  static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); }
+
+  static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+  static Status CheckSlicePartsLength(const SliceParts& key,
+                                      const SliceParts& value);
+
+  // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
+  //
+  // If ignore_missing_column_families == true, a WriteBatch
+  // referencing a non-existing column family will be ignored.
+  // If ignore_missing_column_families == false, processing of the
+  // batches will be stopped if a reference is found to a non-existing
+  // column family and InvalidArgument() will be returned. The writes
+  // in batches may be only partially applied at that point.
+  //
+  // If log_number is non-zero, the memtable will be updated only if
+  // log_number >= memtables->GetLogNumber(), i.e. updates that a column
+  // family has already persisted are not applied again.
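+  // (Recovery example, as a sketch: GetLogNumber() == 7 for a column family
+  // means everything before WAL 7 is already flushed for it, so records
+  // replayed from WAL 5 are skipped because they are already persistent in
+  // SST files, while records from WAL 7 or later are still applied.)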
+  //
+  // If flush_scheduler is non-null, it will be invoked if the memtable
+  // should be flushed.
+  //
+  // Under concurrent use, the caller is responsible for making sure that
+  // the memtables object itself is thread-local.
+  static Status InsertInto(
+      WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+      ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  // Convenience form of InsertInto when you have only one batch.
+  // next_seq returns the seq after the last sequence number used in the
+  // MemTable insert.
+  static Status InsertInto(
+      const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+      FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+                           ColumnFamilyMemTables* memtables,
+                           FlushScheduler* flush_scheduler,
+                           TrimHistoryScheduler* trim_history_scheduler,
+                           bool ignore_missing_column_families = false,
+                           uint64_t log_number = 0, DB* db = nullptr,
+                           bool concurrent_memtable_writes = false,
+                           bool seq_per_batch = false, size_t batch_cnt = 0,
+                           bool batch_per_txn = true,
+                           bool hint_per_batch = false);
+
+  // Appends the src write batch to the dst write batch and updates the count
+  // in dst. Returns OK if the append is successful. Checks the number of
+  // checksums against the counts in both dst and src, and returns Corruption
+  // if they are inconsistent.
+  static Status Append(WriteBatch* dst, const WriteBatch* src,
+                       const bool wal_only = false);
+
+  // Returns the byte size of appending a WriteBatch with ByteSize
+  // leftByteSize and a WriteBatch with ByteSize rightByteSize
+  static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+  // Iterate over the [begin, end) range of a write batch
+  static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+                        size_t begin, size_t end);
+
+  // This write batch includes the latest state that should be persisted.
+  // Such state is meant to be used only during recovery.
+  static void SetAsLatestPersistentState(WriteBatch* b);
+  static bool IsLatestPersistentState(const WriteBatch* b);
+
+  static std::tuple<uint32_t, size_t> GetColumnFamilyIdAndTimestampSize(
+      WriteBatch* b, ColumnFamilyHandle* column_family);
+
+  static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
+    return wb.needs_in_place_update_ts_;
+  }
+
+  static bool HasKeyWithTimestamp(const WriteBatch& wb) {
+    return wb.has_key_with_ts_;
+  }
+
+  // Update the per-key-value protection information on this write batch.
+  // If a checksum is provided, the batch content is verified against the
+  // checksum.
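+  // A usage sketch (hypothetical caller, not part of this header):
+  //
+  //   WriteBatch wb;
+  //   ... populate wb ...
+  //   uint64_t checksum = XXH3_64bits(record.data(), record.size());
+  //   Status s = WriteBatchInternal::UpdateProtectionInfo(
+  //       &wb, 8 /* bytes_per_key */, &checksum);
+  //
+  // Only 0 (strip protection) and 8 bytes per key are accepted; any other
+  // value yields Status::NotSupported.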
+  static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+                                     uint64_t* checksum = nullptr);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+  explicit LocalSavePoint(WriteBatch* batch)
+      : batch_(batch),
+        savepoint_(batch->GetDataSize(), batch->Count(),
+                   batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+        ,
+        committed_(false)
+#endif
+  {
+  }
+
+#ifndef NDEBUG
+  ~LocalSavePoint() { assert(committed_); }
+#endif
+  Status commit() {
+#ifndef NDEBUG
+    committed_ = true;
+#endif
+    if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+      batch_->rep_.resize(savepoint_.size);
+      WriteBatchInternal::SetCount(batch_, savepoint_.count);
+      if (batch_->prot_info_ != nullptr) {
+        batch_->prot_info_->entries_.resize(savepoint_.count);
+      }
+      batch_->content_flags_.store(savepoint_.content_flags,
+                                   std::memory_order_relaxed);
+      return Status::MemoryLimit();
+    }
+    return Status::OK();
+  }
+
+ private:
+  WriteBatch* batch_;
+  SavePoint savepoint_;
+#ifndef NDEBUG
+  bool committed_;
+#endif
+};
+
+template <typename TimestampSizeFuncType>
+class TimestampUpdater : public WriteBatch::Handler {
+ public:
+  explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
+                            TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
+      : prot_info_(prot_info),
+        ts_sz_func_(std::move(ts_sz_func)),
+        timestamp_(ts) {
+    assert(!timestamp_.empty());
+  }
+
+  ~TimestampUpdater() override {}
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    Status s = UpdateTimestamp(cf, begin_key, true /* is_key */);
+    if (s.ok()) {
+      s = UpdateTimestamp(cf, end_key, false /* is_key */);
+    }
+    return s;
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+  Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+ private:
+  // @param is_key specifies whether the update is for a key or a value.
+  Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) {
+    Status s = UpdateTimestampImpl(cf, buf, idx_, is_key);
+    ++idx_;
+    return s;
+  }
+
+  Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/,
+                             bool is_key) {
+    if (timestamp_.empty()) {
+      return Status::InvalidArgument("Timestamp is empty");
+    }
+    size_t cf_ts_sz = ts_sz_func_(cf);
+    if (0 == cf_ts_sz) {
+      // Skip this column family.
+      return Status::OK();
+    } else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
+      // Column family timestamp info not found.
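+      // (Callers signal an unknown column family by returning
+      // std::numeric_limits<size_t>::max() from the size callback; see
+      // checker2 in the UpdateTimestamps test for an example.)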
+      return Status::NotFound();
+    } else if (cf_ts_sz != timestamp_.size()) {
+      return Status::InvalidArgument("timestamp size mismatch");
+    }
+    UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key);
+
+    char* ptr = const_cast<char*>(buf.data() + buf.size() - cf_ts_sz);
+    assert(ptr);
+    memcpy(ptr, timestamp_.data(), timestamp_.size());
+    return Status::OK();
+  }
+
+  void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts,
+                                           bool is_key) {
+    if (prot_info_ != nullptr) {
+      const size_t ts_sz = ts.size();
+      SliceParts old(&buf, 1);
+      Slice old_no_ts(buf.data(), buf.size() - ts_sz);
+      std::array<Slice, 2> new_key_cmpts{{old_no_ts, ts}};
+      SliceParts new_parts(new_key_cmpts.data(), 2);
+      if (is_key) {
+        prot_info_->entries_[idx_].UpdateK(old, new_parts);
+      } else {
+        prot_info_->entries_[idx_].UpdateV(old, new_parts);
+      }
+    }
+  }
+
+  // No copy or move.
+  TimestampUpdater(const TimestampUpdater&) = delete;
+  TimestampUpdater(TimestampUpdater&&) = delete;
+  TimestampUpdater& operator=(const TimestampUpdater&) = delete;
+  TimestampUpdater& operator=(TimestampUpdater&&) = delete;
+
+  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+  const TimestampSizeFuncType ts_sz_func_{};
+  const Slice timestamp_;
+  size_t idx_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..d233853e2
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,1114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b,
+                                 bool merge_operator_supported = true) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = factory;
+  if (merge_operator_supported) {
+    options.merge_operator.reset(new TestPutOperator());
+  }
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+  mem->Ref();
+  std::string state;
+  ColumnFamilyMemTablesDefault cf_mems_default(mem);
+  Status s =
+      WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+  uint32_t count = 0;
+  int put_count = 0;
+  int delete_count = 0;
+  int single_delete_count = 0;
+  int delete_range_count = 0;
+  int merge_count = 0;
+  for (int i = 0; i < 2; ++i) {
+    Arena arena;
+    ScopedArenaIterator arena_iter_guard;
+    std::unique_ptr<InternalIterator> iter_guard;
+    InternalIterator* iter;
+    if (i == 0) {
+      iter = mem->NewIterator(ReadOptions(), &arena);
+      arena_iter_guard.set(iter);
+    } else {
+      iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+                                            kMaxSequenceNumber /* read_seq */,
+                                            false /* immutable_memtable */);
+      iter_guard.reset(iter);
+    }
+    if (iter == nullptr) {
+      continue;
+    }
+    EXPECT_OK(iter->status());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ParsedInternalKey ikey;
+      ikey.clear();
+      EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      switch (ikey.type) {
+        case kTypeValue:
+          state.append("Put(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          put_count++;
+          break;
+        case kTypeDeletion:
+          state.append("Delete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          delete_count++;
+          break;
+        case kTypeSingleDeletion:
+          state.append("SingleDelete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          single_delete_count++;
+          break;
+        case kTypeRangeDeletion:
+          state.append("DeleteRange(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          delete_range_count++;
+          break;
+        case kTypeMerge:
+          state.append("Merge(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          merge_count++;
+          break;
+        default:
+          assert(false);
+          break;
+      }
+      state.append("@");
+      state.append(std::to_string(ikey.sequence));
+    }
+    EXPECT_OK(iter->status());
+  }
+  if (s.ok()) {
+    EXPECT_EQ(b->HasPut(), put_count > 0);
+    EXPECT_EQ(b->HasDelete(), delete_count > 0);
+    EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+    EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+    EXPECT_EQ(b->HasMerge(), merge_count > 0);
+    if (count != WriteBatchInternal::Count(b)) {
+      state.append("CountMismatch()");
+    }
+  } else {
+ state.append(s.ToString()); + } + delete mem->Unref(); + return state; +} + +class WriteBatchTest : public testing::Test {}; + +TEST_F(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0u, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0u, batch.Count()); +} + +TEST_F(WriteBatchTest, Multiple) { + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo"))); + ASSERT_OK(batch.Put(Slice("baz"), Slice("boo"))); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); + ASSERT_EQ( + "Put(baz, boo)@103" + "Delete(box)@101" + "Put(foo, bar)@100" + "DeleteRange(bar, foo)@102", + PrintContents(&batch)); + ASSERT_EQ(4u, batch.Count()); +} + +TEST_F(WriteBatchTest, Corruption) { + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + ASSERT_OK(WriteBatchInternal::SetContents( + &batch, Slice(contents.data(), contents.size() - 1))); + ASSERT_EQ( + "Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); +} + +TEST_F(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); + ASSERT_EQ("", PrintContents(&b1)); + ASSERT_EQ(0u, b1.Count()); + ASSERT_OK(b2.Put("a", "va")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); + ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); + ASSERT_EQ(1u, b1.Count()); + b2.Clear(); + ASSERT_OK(b2.Put("b", "vb")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + ASSERT_EQ(2u, b1.Count()); + ASSERT_OK(b2.Delete("foo")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(4u, b1.Count()); + b2.Clear(); + ASSERT_OK(b2.Put("c", "cc")); + ASSERT_OK(b2.Put("d", "dd")); + b2.MarkWalTerminationPoint(); + ASSERT_OK(b2.Put("e", "ee")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true)); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Put(c, cc)@204" + "Put(d, dd)@205" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(6u, b1.Count()); + ASSERT_EQ( + "Put(c, cc)@0" + "Put(d, dd)@1" + "Put(e, ee)@2", + PrintContents(&b2)); + ASSERT_EQ(3u, b2.Count()); +} + +TEST_F(WriteBatchTest, SingleDeletion) { + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0u, batch.Count()); + ASSERT_OK(batch.Put("a", "va")); + ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); + ASSERT_EQ(1u, batch.Count()); + ASSERT_OK(batch.SingleDelete("a")); + ASSERT_EQ( + "SingleDelete(a)@101" + "Put(a, va)@100", + PrintContents(&batch)); + ASSERT_EQ(2u, batch.Count()); +} + +namespace { +struct TestHandler : public WriteBatch::Handler { + std::string seen; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "PutCF(" + std::to_string(column_family_id) + ", " + + 
key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id == 0) { + seen += "Delete(" + key.ToString() + ")"; + } else { + seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; + } + return Status::OK(); + } + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id == 0) { + seen += "SingleDelete(" + key.ToString() + ")"; + } else { + seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; + } + return Status::OK(); + } + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + if (column_family_id == 0) { + seen += "DeleteRange(" + begin_key.ToString() + ", " + + end_key.ToString() + ")"; + } else { + seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " + + begin_key.ToString() + ", " + end_key.ToString() + ")"; + } + return Status::OK(); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + void LogData(const Slice& blob) override { + seen += "LogData(" + blob.ToString() + ")"; + } + Status MarkBeginPrepare(bool unprepare) override { + seen += + "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")"; + return Status::OK(); + } + Status MarkEndPrepare(const Slice& xid) override { + seen += "MarkEndPrepare(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkNoop(bool empty_batch) override { + seen += "MarkNoop(" + std::string(empty_batch ? 
"true" : "false") + ")"; + return Status::OK(); + } + Status MarkCommit(const Slice& xid) override { + seen += "MarkCommit(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override { + seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " + + ts.ToString(true) + ")"; + return Status::OK(); + } + Status MarkRollback(const Slice& xid) override { + seen += "MarkRollback(" + xid.ToString() + ")"; + return Status::OK(); + } +}; +} // anonymous namespace + +TEST_F(WriteBatchTest, PutNotImplemented) { + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, DeleteNotImplemented) { + WriteBatch batch; + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { + WriteBatch batch; + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, MergeNotImplemented) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ( + "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator " + "!= nullptr`", + PrintContents(&batch, false /* merge_operator_supported */)); +} + +TEST_F(WriteBatchTest, Blob) { + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.Put(Slice("k3"), Slice("v3"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_OK(batch.SingleDelete(Slice("k3"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(6u, batch.Count()); + ASSERT_EQ( + "Merge(foo, bar)@5" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "SingleDelete(k3)@4" + "Put(k3, v3)@2", + PrintContents(&batch)); + + TestHandler handler; + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "Put(k3, v3)" + "LogData(blob1)" + "Delete(k2)" + "SingleDelete(k3)" + "LogData(blob2)" + "Merge(foo, bar)", + handler.seen); +} + +TEST_F(WriteBatchTest, PrepareCommit) { + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::InsertNoop(&batch)); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + batch.SetSavePoint(); + ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); + Status s = batch.RollbackToSavePoint(); + ASSERT_EQ(s, Status::NotFound()); + ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); + ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); + ASSERT_EQ(2u, batch.Count()); + + TestHandler handler; + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "MarkBeginPrepare(false)" + "Put(k1, v1)" + "Put(k2, v2)" + 
"MarkEndPrepare(xid1)" + "MarkCommit(xid1)" + "MarkRollback(xid1)", + handler.seen); +} + +// It requires more than 30GB of memory to run the test. With single memory +// allocation of more than 30GB. +// Not all platform can run it. Also it runs a long time. So disable it. +TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { + // Insert key and value of 3GB and push total batch size to 12GB. + static const size_t kKeyValueSize = 4u; + static const uint32_t kNumUpdates = uint32_t{3} << 30; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); + char c = 'A'; + for (uint32_t i = 0; i < kNumUpdates; i++) { + if (c > 'Z') { + c = 'A'; + } + raw[0] = c; + raw[raw.length() - 1] = c; + c++; + ASSERT_OK(batch.Put(raw, raw)); + } + + ASSERT_EQ(kNumUpdates, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + uint32_t num_seen = 0; + char expected_char = 'A'; + Status PutCF(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ(expected_char, key[0]); + EXPECT_EQ(expected_char, value[0]); + EXPECT_EQ(expected_char, key[kKeyValueSize - 1]); + EXPECT_EQ(expected_char, value[kKeyValueSize - 1]); + expected_char++; + if (expected_char > 'Z') { + expected_char = 'A'; + } + ++num_seen; + return Status::OK(); + } + Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + ADD_FAILURE(); + return Status::OK(); + } + void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } + bool Continue() override { return num_seen < kNumUpdates; } + } handler; + + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ(kNumUpdates, handler.num_seen); +} + +// The test requires more than 18GB memory to run it, with single memory +// allocation of more than 12GB. Not all the platform can run it. So disable it. +TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { + // Insert key and value of 3GB and push total batch size to 12GB. 
+ static const size_t kKeyValueSize = 3221225472u; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(size_t(12884901888ull + 1024u)); + for (char i = 0; i < 2; i++) { + raw[0] = 'A' + i; + raw[raw.length() - 1] = 'A' - i; + ASSERT_OK(batch.Put(raw, raw)); + } + + ASSERT_EQ(2u, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + int num_seen = 0; + Status PutCF(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ('A' + num_seen, key[0]); + EXPECT_EQ('A' + num_seen, value[0]); + EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]); + EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]); + ++num_seen; + return Status::OK(); + } + Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + ADD_FAILURE(); + return Status::OK(); + } + void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } + bool Continue() override { return num_seen < 2; } + } handler; + + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ(2, handler.num_seen); +} + +TEST_F(WriteBatchTest, Continue) { + WriteBatch batch; + + struct Handler : public TestHandler { + int num_seen = 0; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + ++num_seen; + return TestHandler::PutCF(column_family_id, key, value); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + ++num_seen; + return TestHandler::DeleteCF(column_family_id, key); + } + Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + ++num_seen; + return TestHandler::SingleDeleteCF(column_family_id, key); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + ++num_seen; + return TestHandler::MergeCF(column_family_id, key, value); + } + void LogData(const Slice& blob) override { + ++num_seen; + TestHandler::LogData(blob); + } + bool Continue() override { return num_seen < 5; } + } handler; + + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k1"))); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "LogData(blob1)" + "Delete(k1)" + "SingleDelete(k2)", + handler.seen); +} + +TEST_F(WriteBatchTest, PutGatherSlices) { + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + + { + // Try a write where the key is one slice but the value is two + Slice key_slice("baz"); + Slice value_slices[2] = {Slice("header"), Slice("payload")}; + ASSERT_OK( + batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); + } + + { + // One where the key is composite but the value is a single slice + Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")}; + Slice value_slice("value"); + ASSERT_OK( + batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); + } + + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ( + "Put(baz, headerpayload)@101" + 
"Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); + ASSERT_EQ(3u, batch.Count()); +} + +namespace { +class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { + public: + explicit ColumnFamilyHandleImplDummy(int id) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + ucmp_(ucmp) {} + uint32_t GetID() const override { return id_; } + const Comparator* GetComparator() const override { return ucmp_; } + + private: + uint32_t id_; + const Comparator* const ucmp_ = BytewiseComparator(); +}; +} // anonymous namespace + +TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { + WriteBatch batch; + ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); + + TestHandler handler; + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "SingleDeleteCF(2, twofoo)" + "DeleteRangeCF(2, 3foo, 4foo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} + +#ifndef ROCKSDB_LITE +TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { + WriteBatchWithIndex batch; + ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); + + std::unique_ptr iter; + + iter.reset(batch.NewIterator(&eight)); + iter->Seek("eightfoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("eightfoo", iter->Entry().key.ToString()); + ASSERT_EQ("bar8", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type); + ASSERT_EQ("eightfoo", iter->Entry().key.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter.reset(batch.NewIterator(&two)); + iter->Seek("twofoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("twofoo", iter->Entry().key.ToString()); + ASSERT_EQ("bar2", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type); + ASSERT_EQ("twofoo", iter->Entry().key.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + 
iter.reset(batch.NewIterator()); + iter->Seek("gggg"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type); + ASSERT_EQ("omom", iter->Entry().key.ToString()); + ASSERT_EQ("nom", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter.reset(batch.NewIterator(&zero)); + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type); + ASSERT_EQ("omom", iter->Entry().key.ToString()); + ASSERT_EQ("nom", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + TestHandler handler; + ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler)); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "SingleDeleteCF(2, twofoo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} +#endif // !ROCKSDB_LITE + +TEST_F(WriteBatchTest, SavePointTest) { + Status s; + WriteBatch batch; + batch.SetSavePoint(); + + ASSERT_OK(batch.Put("A", "a")); + ASSERT_OK(batch.Put("B", "b")); + batch.SetSavePoint(); + + ASSERT_OK(batch.Put("C", "c")); + ASSERT_OK(batch.Delete("A")); + batch.SetSavePoint(); + batch.SetSavePoint(); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@3" + "Put(A, a)@0" + "Put(B, b)@1" + "Put(C, c)@2", + PrintContents(&batch)); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Put(A, a)@0" + "Put(B, b)@1", + PrintContents(&batch)); + + ASSERT_OK(batch.Delete("A")); + ASSERT_OK(batch.Put("B", "bb")); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ("", PrintContents(&batch)); + + s = batch.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch)); + + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); + + batch.SetSavePoint(); + + ASSERT_OK(batch.Put("A", "aaa")); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + batch.SetSavePoint(); + + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + s = batch.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + WriteBatch batch2; + + s = batch2.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch2)); + + ASSERT_OK(batch2.Delete("A")); + batch2.SetSavePoint(); + + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("Delete(A)@0", PrintContents(&batch2)); + + batch2.Clear(); + ASSERT_EQ("", PrintContents(&batch2)); + + batch2.SetSavePoint(); + + ASSERT_OK(batch2.Delete("B")); + ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); + + batch2.SetSavePoint(); + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); 
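+  // (A savepoint set after the Delete records the batch as already
+  // containing Delete(B), so rolling back to it removes nothing and the
+  // contents check below is unchanged.)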
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); + + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("", PrintContents(&batch2)); + + s = batch2.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch2)); + + WriteBatch batch3; + + s = batch3.PopSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch3)); + + batch3.SetSavePoint(); + ASSERT_OK(batch3.Delete("A")); + + s = batch3.PopSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("Delete(A)@0", PrintContents(&batch3)); +} + +TEST_F(WriteBatchTest, MemoryLimitTest) { + Status s; + // The header size is 12 bytes. The two Puts take 8 bytes which gives total + // of 12 + 8 * 2 = 28 bytes. + WriteBatch batch(0, 28); + + ASSERT_OK(batch.Put("a", "....")); + ASSERT_OK(batch.Put("b", "....")); + s = batch.Put("c", "...."); + ASSERT_TRUE(s.IsMemoryLimit()); +} + +namespace { +class TimestampChecker : public WriteBatch::Handler { + public: + explicit TimestampChecker( + std::unordered_map cf_to_ucmps, Slice ts) + : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {} + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override { + auto cf_iter = cf_to_ucmps_.find(cf); + if (cf_iter == cf_to_ucmps_.end()) { + return Status::Corruption(); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return Status::OK(); + } + if (key.size() < ts_sz) { + return Status::Corruption(); + } + Slice ts = ExtractTimestampFromUserKey(key, ts_sz); + if (ts.compare(timestamp_) != 0) { + return Status::Corruption(); + } + return Status::OK(); + } + + private: + std::unordered_map cf_to_ucmps_; + Slice timestamp_; +}; + +Status CheckTimestampsInWriteBatch( + WriteBatch& wb, Slice timestamp, + std::unordered_map cf_to_ucmps) { + TimestampChecker ts_checker(cf_to_ucmps, timestamp); + return wb.Iterate(&ts_checker); +} +} // anonymous namespace + +TEST_F(WriteBatchTest, SanityChecks) { + ColumnFamilyHandleImplDummy cf0(0, + test::BytewiseComparatorWithU64TsWrapper()); + ColumnFamilyHandleImplDummy cf4(4); + + WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t)); + + // Sanity checks for the new WriteBatch APIs with extra 'ts' arg. 
+ ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument()); + ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument()); + ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument()); + ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument()); + ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts") + .IsInvalidArgument()); + + ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument()); + ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument()); + ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument()); + ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument()); + ASSERT_TRUE( + wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument()); + + constexpr size_t wrong_ts_sz = 1 + sizeof(uint64_t); + std::string ts(wrong_ts_sz, '\0'); + + ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument()); + ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument()); + ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument()); + ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument()); + ASSERT_TRUE( + wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument()); + + // Sanity checks for the new WriteBatch APIs without extra 'ts' arg. + WriteBatch wb1(0, 0, 0, wrong_ts_sz); + ASSERT_TRUE(wb1.Put(&cf0, "key", "value").IsInvalidArgument()); + ASSERT_TRUE(wb1.Delete(&cf0, "key").IsInvalidArgument()); + ASSERT_TRUE(wb1.SingleDelete(&cf0, "key").IsInvalidArgument()); + ASSERT_TRUE(wb1.Merge(&cf0, "key", "value").IsInvalidArgument()); + ASSERT_TRUE( + wb1.DeleteRange(&cf0, "begin_key", "end_key").IsInvalidArgument()); +} + +TEST_F(WriteBatchTest, UpdateTimestamps) { + // We assume the last eight bytes of each key is reserved for timestamps. + // Therefore, we must make sure each key is longer than eight bytes. + constexpr size_t key_size = 16; + constexpr size_t num_of_keys = 10; + std::vector key_strs(num_of_keys, std::string(key_size, '\0')); + + ColumnFamilyHandleImplDummy cf0(0); + ColumnFamilyHandleImplDummy cf4(4, + test::BytewiseComparatorWithU64TsWrapper()); + ColumnFamilyHandleImplDummy cf5(5, + test::BytewiseComparatorWithU64TsWrapper()); + + const std::unordered_map cf_to_ucmps = { + {0, cf0.GetComparator()}, + {4, cf4.GetComparator()}, + {5, cf5.GetComparator()}}; + + static constexpr size_t timestamp_size = sizeof(uint64_t); + + { + WriteBatch wb1, wb2, wb3, wb4, wb5, wb6, wb7; + ASSERT_OK(wb1.Put(&cf0, "key", "value")); + ASSERT_FALSE(WriteBatchInternal::HasKeyWithTimestamp(wb1)); + ASSERT_OK(wb2.Put(&cf4, "key", "value")); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb2)); + ASSERT_OK(wb3.Put(&cf4, "key", /*ts=*/std::string(timestamp_size, '\xfe'), + "value")); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb3)); + ASSERT_OK(wb4.Delete(&cf4, "key", + /*ts=*/std::string(timestamp_size, '\xfe'))); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb4)); + ASSERT_OK(wb5.Delete(&cf4, "key")); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb5)); + ASSERT_OK(wb6.SingleDelete(&cf4, "key")); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb6)); + ASSERT_OK(wb7.SingleDelete(&cf4, "key", + /*ts=*/std::string(timestamp_size, '\xfe'))); + ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb7)); + } + + WriteBatch batch; + // Write to the batch. We will assign timestamps later. 
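+  // This mirrors the intended production flow (a sketch, with
+  // ts_sz_lookup standing in for the checker1/checker2 callables defined
+  // below): keys are first written with zeroed, correctly sized timestamp
+  // placeholders, and a later call such as
+  //
+  //   batch.UpdateTimestamps(commit_ts, ts_sz_lookup);
+  //
+  // splices the real timestamp into the trailing bytes of every key in
+  // place, where the callable maps a column family id to its expected
+  // timestamp size.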
+ for (const auto& key_str : key_strs) { + ASSERT_OK(batch.Put(&cf0, key_str, "value")); + ASSERT_OK(batch.Put(&cf4, key_str, "value")); + ASSERT_OK(batch.Put(&cf5, key_str, "value")); + } + + const auto checker1 = [](uint32_t cf) { + if (cf == 4 || cf == 5) { + return timestamp_size; + } else if (cf == 0) { + return static_cast(0); + } else { + return std::numeric_limits::max(); + } + }; + ASSERT_OK( + batch.UpdateTimestamps(std::string(timestamp_size, '\xfe'), checker1)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps)); + + // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to + // simulate the case in which a transaction enables indexing for some writes + // while disables indexing for other writes. A transaction uses a + // WriteBatchWithIndex object to buffer writes (we consider Write-committed + // policy only). If indexing is enabled, then writes go through + // WriteBatchWithIndex API populating a WBWI internal data structure, i.e. a + // mapping from cf to user comparators. If indexing is disabled, a transaction + // writes directly to the underlying raw WriteBatch. We will need to track the + // comparator information for the column families to which un-indexed writes + // are performed. When calling UpdateTimestamp API of WriteBatch, we need + // indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to perform + // checking. + std::unordered_map indexed_cf_to_ucmps = { + {0, cf0.GetComparator()}, {4, cf4.GetComparator()}}; + std::unordered_set non_indexed_cfs_with_ts = {cf5.GetID()}; + const auto checker2 = [&indexed_cf_to_ucmps, + &non_indexed_cfs_with_ts](uint32_t cf) { + if (non_indexed_cfs_with_ts.count(cf) > 0) { + return timestamp_size; + } + auto cf_iter = indexed_cf_to_ucmps.find(cf); + if (cf_iter == indexed_cf_to_ucmps.end()) { + assert(false); + return std::numeric_limits::max(); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + return ucmp->timestamp_size(); + }; + ASSERT_OK( + batch.UpdateTimestamps(std::string(timestamp_size, '\xef'), checker2)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xef'), cf_to_ucmps)); +} + +TEST_F(WriteBatchTest, CommitWithTimestamp) { + WriteBatch wb; + const std::string txn_name = "xid1"; + std::string ts; + constexpr uint64_t commit_ts = 23; + PutFixed64(&ts, commit_ts); + ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts)); + TestHandler handler; + ASSERT_OK(wb.Iterate(&handler)); + ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " + + Slice(ts).ToString(true) + ")", + handler.seen); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h new file mode 100644 index 000000000..106d02041 --- /dev/null +++ b/src/rocksdb/db/write_callback.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +class WriteCallback { + public: + virtual ~WriteCallback() {} + + // Will be called while on the write thread before the write executes. 
+  // this function returns a non-OK status, the write will be aborted and this
+  // status will be returned to the caller of DB::Write().
+  virtual Status Callback(DB* db) = 0;
+
+  // Return true if writes with this callback can be batched with other
+  // writes.
+  virtual bool AllowWriteBatching() = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..e6ebaae08
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,465 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/write_callback.h"
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+  string dbname;
+
+  WriteCallbackTest() {
+    dbname = test::PerThreadDBPath("write_callback_testdb");
+  }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+  bool was_called = false;
+
+  Status Callback(DB* db) override {
+    was_called = true;
+
+    // Make sure db is a DBImpl
+    DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
+    if (db_impl == nullptr) {
+      return Status::InvalidArgument("");
+    }
+
+    return Status::OK();
+  }
+
+  bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+  Status Callback(DB* /*db*/) override { return Status::Busy(); }
+  bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+  bool should_fail_ = false;
+  bool allow_batching_ = false;
+  std::atomic<bool> was_called_{false};
+
+  MockWriteCallback() {}
+
+  MockWriteCallback(const MockWriteCallback& other) {
+    should_fail_ = other.should_fail_;
+    allow_batching_ = other.allow_batching_;
+    was_called_.store(other.was_called_.load());
+  }
+
+  Status Callback(DB* /*db*/) override {
+    was_called_.store(true);
+    if (should_fail_) {
+      return Status::Busy();
+    } else {
+      return Status::OK();
+    }
+  }
+
+  bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class WriteCallbackPTest
+    : public WriteCallbackTest,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
+ public:
+  WriteCallbackPTest() {
+    std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_,
+             allow_batching_, enable_WAL_, enable_pipelined_write_) =
+        GetParam();
+  }
+
+ protected:
+  bool unordered_write_;
+  bool seq_per_batch_;
+  bool two_queues_;
+  bool allow_parallel_;
+  bool allow_batching_;
+  bool enable_WAL_;
+  bool enable_pipelined_write_;
+};
+
+TEST_P(WriteCallbackPTest, WriteWithCallbackTest) {
+  struct WriteOP {
+    WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+    void Put(const string& key, const string& val) {
+      kvs_.push_back(std::make_pair(key, val));
+      ASSERT_OK(write_batch_.Put(key, val));
+    }
+
+    void Clear() {
+      kvs_.clear();
+      write_batch_.Clear();
+      callback_.was_called_.store(false);
+    }
+
+    MockWriteCallback callback_;
+    WriteBatch write_batch_;
+    std::vector<std::pair<string, string>> kvs_;
+  };
+
+  // In each scenario we'll launch multiple threads to write.
+  // The size of each array equals the number of threads, and each boolean
+  // in it denotes whether the callback of the corresponding thread should
+  // succeed or fail.
+  std::vector<std::vector<WriteOP>> write_scenarios = {
+      {true},
+      {false},
+      {false, false},
+      {true, true},
+      {true, false},
+      {false, true},
+      {false, false, false},
+      {true, true, true},
+      {false, true, false},
+      {true, false, true},
+      {true, false, false, false, false},
+      {false, false, false, false, true},
+      {false, false, true, false, true},
+  };
+
+  for (auto& write_group : write_scenarios) {
+    Options options;
+    options.create_if_missing = true;
+    options.unordered_write = unordered_write_;
+    options.allow_concurrent_memtable_write = allow_parallel_;
+    options.enable_pipelined_write = enable_pipelined_write_;
+    options.two_write_queues = two_queues_;
+    // Skip unsupported combinations
+    if (options.enable_pipelined_write && seq_per_batch_) {
+      continue;
+    }
+    if (options.enable_pipelined_write && options.two_write_queues) {
+      continue;
+    }
+    if (options.unordered_write && !options.allow_concurrent_memtable_write) {
+      continue;
+    }
+    if (options.unordered_write && options.enable_pipelined_write) {
+      continue;
+    }
+
+    ReadOptions read_options;
+    DB* db;
+    DBImpl* db_impl;
+
+    ASSERT_OK(DestroyDB(dbname, options));
+
+    DBOptions db_options(options);
+    ColumnFamilyOptions cf_options(options);
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.push_back(
+        ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+    std::vector<ColumnFamilyHandle*> handles;
+    auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles,
+                               &db, seq_per_batch_, true /* batch_per_txn */);
+    ASSERT_OK(open_s);
+    assert(handles.size() == 1);
+    delete handles[0];
+
+    db_impl = dynamic_cast<DBImpl*>(db);
+    ASSERT_TRUE(db_impl);
+
+    // Writers that have called JoinBatchGroup.
+    std::atomic<uint64_t> threads_joining(0);
+    // Writers that have linked to the queue.
+    std::atomic<uint64_t> threads_linked(0);
+    // Writers that pass the WriteThread::JoinBatchGroup:Wait sync-point.
+    std::atomic<uint64_t> threads_verified(0);
+
+    std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+    ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Start", [&](void*) {
+          uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+          // Wait for the last joined writer to link to the queue.
+          // In this way the writers link to the queue one by one.
+          // This allows us to confidently detect the first writer
+          // who increases threads_linked as the leader.
+          while (threads_linked.load() < cur_threads_joining) {
+          }
+        });
+
+    // Verification once writers call JoinBatchGroup.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+          uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+          bool is_leader = false;
+          bool is_last = false;
+
+          // who am i
+          is_leader = (cur_threads_linked == 0);
+          is_last = (cur_threads_linked == write_group.size() - 1);
+
+          // check my state
+          auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+          if (is_leader) {
+            ASSERT_TRUE(writer->state ==
+                        WriteThread::State::STATE_GROUP_LEADER);
+          } else {
+            ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT);
+          }
+
+          // (meta test) the first WriteOP should indeed be the first
+          // and the last should be the last (all others can be out of
+          // order)
+          if (is_leader) {
+            ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+                        !write_group.front().callback_.should_fail_);
+          } else if (is_last) {
+            ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+                        !write_group.back().callback_.should_fail_);
+          }
+
+          threads_verified.fetch_add(1);
+          // Wait here until all verification in this sync-point
+          // callback finish for all writers.
+          while (threads_verified.load() < write_group.size()) {
+          }
+        });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+          // check my state
+          auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+          if (!allow_batching_) {
+            // no batching so everyone should be a leader
+            ASSERT_TRUE(writer->state ==
+                        WriteThread::State::STATE_GROUP_LEADER);
+          } else if (!allow_parallel_) {
+            ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED ||
+                        (enable_pipelined_write_ &&
+                         writer->state ==
+                             WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
+          }
+        });
+
+    std::atomic<uint32_t> thread_num(0);
+    std::atomic<char> dummy_key(0);
+
+    // Each write thread creates a random write batch and writes it to the DB
+    // with a write callback.
+    std::function<void()> write_with_callback_func = [&]() {
+      uint32_t i = thread_num.fetch_add(1);
+      Random rnd(i);
+
+      // leaders gotta lead
+      while (i > 0 && threads_verified.load() < 1) {
+      }
+
+      // loser has to lose
+      while (i == write_group.size() - 1 &&
+             threads_verified.load() < write_group.size() - 1) {
+      }
+
+      auto& write_op = write_group.at(i);
+      write_op.Clear();
+      write_op.callback_.allow_batching_ = allow_batching_;
+
+      // insert some keys
+      for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+        // grab unique key
+        char my_key = dummy_key.fetch_add(1);
+
+        string skey(5, my_key);
+        string sval(10, my_key);
+        write_op.Put(skey, sval);
+
+        if (!write_op.callback_.should_fail_ && !seq_per_batch_) {
+          seq.fetch_add(1);
+        }
+      }
+      if (!write_op.callback_.should_fail_ && seq_per_batch_) {
+        seq.fetch_add(1);
+      }
+
+      WriteOptions woptions;
+      woptions.disableWAL = !enable_WAL_;
+      woptions.sync = enable_WAL_;
+      if (woptions.protection_bytes_per_key > 0) {
+        ASSERT_OK(WriteBatchInternal::UpdateProtectionInfo(
+            &write_op.write_batch_, woptions.protection_bytes_per_key));
+      }
+      Status s;
+      if (seq_per_batch_) {
+        class PublishSeqCallback : public PreReleaseCallback {
+         public:
+          PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {}
+          Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t,
+                          size_t /*index*/, size_t /*total*/) override {
+            db_impl_->SetLastPublishedSequence(last_seq);
+            return Status::OK();
+          }
+          DBImpl* db_impl_;
+        } publish_seq_callback(db_impl);
+        // seq_per_batch_ requires a natural batch separator or Noop
+        ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_));
+        const size_t ONE_BATCH = 1;
+        s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
+                               &write_op.callback_, nullptr, 0, false, nullptr,
+                               ONE_BATCH,
+                               two_queues_ ? &publish_seq_callback : nullptr);
+      } else {
+        s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_,
+                                       &write_op.callback_);
+      }
+
+      if (write_op.callback_.should_fail_) {
+        ASSERT_TRUE(s.IsBusy());
+      } else {
+        ASSERT_OK(s);
+      }
+    };
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // do all the writes
+    std::vector<port::Thread> threads;
+    for (uint32_t i = 0; i < write_group.size(); i++) {
+      threads.emplace_back(write_with_callback_func);
+    }
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    // check for keys
+    string value;
+    for (auto& w : write_group) {
+      ASSERT_TRUE(w.callback_.was_called_.load());
+      for (auto& kvp : w.kvs_) {
+        if (w.callback_.should_fail_) {
+          ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound());
+        } else {
+          ASSERT_OK(db->Get(read_options, kvp.first, &value));
+          ASSERT_EQ(value, kvp.second);
+        }
+      }
+    }
+
+    ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+    delete db;
+    ASSERT_OK(DestroyDB(dbname, options));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+  Options options;
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  DB* db;
+  DBImpl* db_impl;
+
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  db_impl = dynamic_cast<DBImpl*>(db);
+  ASSERT_TRUE(db_impl);
+
+  WriteBatch wb;
+
+  ASSERT_OK(wb.Put("a", "value.a"));
+  ASSERT_OK(wb.Delete("x"));
+
+  // Test a simple Write
+  s = db->Write(write_options, &wb);
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a", value);
+
+  // Test WriteWithCallback
+  WriteCallbackTestWriteCallback1 callback1;
+  WriteBatch wb2;
+
+  ASSERT_OK(wb2.Put("a", "value.a2"));
+
+  s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+  ASSERT_OK(s);
+  ASSERT_TRUE(callback1.was_called);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  // Test WriteWithCallback for a callback that fails
+  WriteCallbackTestWriteCallback2 callback2;
+  WriteBatch wb3;
+
+  ASSERT_OK(wb3.Put("a", "value.a3"));
+
+  s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  delete db;
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..c5f744375
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+  ++total_stopped_;
+  return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+    uint64_t write_rate) {
+  if (0 == total_delayed_++) {
+    // Starting delay, so reset counters.
+    next_refill_time_ = 0;
+    credit_in_bytes_ = 0;
+  }
+  // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in
+  // next_refill_time_ will be based on an old rate. This rate will apply
+  // for subsequent additional debts and for the next refill.
+  set_delayed_write_rate(write_rate);
+  return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+  ++total_compaction_pressure_;
+  return std::unique_ptr<WriteControllerToken>(
+      new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+  return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is called while holding the DB mutex, so we can't sleep and need to
+// minimize the frequency of getting the time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function expects the caller to sleep for the number of micros it
+// returns.
+uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) {
+  if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+    return 0;
+  }
+  if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+    return 0;
+  }
+
+  if (credit_in_bytes_ >= num_bytes) {
+    credit_in_bytes_ -= num_bytes;
+    return 0;
+  }
+  // The frequency to get time inside the DB mutex is less than one per
+  // refill interval.
+  auto time_now = NowMicrosMonotonic(clock);
+
+  const uint64_t kMicrosPerSecond = 1000000;
+  // Refill every 1 ms
+  const uint64_t kMicrosPerRefill = 1000;
+
+  if (next_refill_time_ == 0) {
+    // Start with an initial allotment of bytes for one interval
+    next_refill_time_ = time_now;
+  }
+  if (next_refill_time_ <= time_now) {
+    // Refill based on time interval plus any extra elapsed
+    uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill;
+    credit_in_bytes_ += static_cast<uint64_t>(
+        1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999);
+    next_refill_time_ = time_now + kMicrosPerRefill;
+
+    if (credit_in_bytes_ >= num_bytes) {
+      // Avoid delay if possible, to reduce DB mutex release & re-acquire.
+      credit_in_bytes_ -= num_bytes;
+      return 0;
+    }
+  }
+
+  // We need to delay to avoid exceeding write rate.
+  assert(num_bytes > credit_in_bytes_);
+  uint64_t bytes_over_budget = num_bytes - credit_in_bytes_;
+  uint64_t needed_delay = static_cast<uint64_t>(
+      1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond);
+
+  credit_in_bytes_ = 0;
+  next_refill_time_ += needed_delay;
+
+  // Minimum delay of refill interval, to reduce DB mutex contention.
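+  // Worked example (added commentary with illustrative numbers): with
+  // delayed_write_rate_ = 16 MB/s, zero credit, and num_bytes = 1 MB,
+  // needed_delay = 1 MB / (16 MB/s) = 62,500 micros, so the caller sleeps
+  // about 62.5 ms before proceeding. A request smaller than one refill's
+  // worth (16 KB at this rate) still pays at least kMicrosPerRefill (1 ms)
+  // via the max() below.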
+  return std::max(next_refill_time_ - time_now, kMicrosPerRefill);
+}
+
+uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) {
+  return clock->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+  controller_->total_delayed_--;
+  assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+  controller_->total_compaction_pressure_--;
+  assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..bcead165b
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,148 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SystemClock;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write
+// stalls happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors)
+// need to be called while holding the DB mutex.
+class WriteController {
+ public:
+  explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+                           int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+      : total_stopped_(0),
+        total_delayed_(0),
+        total_compaction_pressure_(0),
+        credit_in_bytes_(0),
+        next_refill_time_(0),
+        low_pri_rate_limiter_(
+            NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+    set_max_delayed_write_rate(_delayed_write_rate);
+  }
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, total delay for
+  // all writes to the DB will be controlled under the delayed write rate.
+  // Every write needs to call GetDelay() with the number of bytes to be
+  // written to the DB, which returns the number of microseconds to sleep.
+  std::unique_ptr<WriteControllerToken> GetDelayToken(
+      uint64_t delayed_write_rate);
+  // When an actor (column family) requests a moderate token, compaction
+  // threads will be increased
+  std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // These three methods query the state of the WriteController.
+  bool IsStopped() const;
+  bool NeedsDelay() const { return total_delayed_.load() > 0; }
+  bool NeedSpeedupCompaction() const {
+    return IsStopped() || NeedsDelay() ||
+           total_compaction_pressure_.load() > 0;
+  }
+  // Returns how many microseconds the caller needs to sleep after the call.
+  // num_bytes: how many bytes to put into the DB.
+  // Prerequisite: DB mutex held.
+  uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
+  void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    } else if (write_rate > max_delayed_write_rate()) {
+      write_rate = max_delayed_write_rate();
+    }
+    delayed_write_rate_ = write_rate;
+  }
+
+  void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    }
+    max_delayed_write_rate_ = write_rate;
+    // update delayed_write_rate_ as well
+    delayed_write_rate_ = write_rate;
+  }
+
+  uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+  uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+  RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+  uint64_t NowMicrosMonotonic(SystemClock* clock);
+
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+  friend class CompactionPressureToken;
+
+  std::atomic<int> total_stopped_;
+  std::atomic<int> total_delayed_;
+  std::atomic<int> total_compaction_pressure_;
+
+  // Number of bytes allowed to write without delay
+  uint64_t credit_in_bytes_;
+  // Next time that we can add more credit of bytes
+  uint64_t next_refill_time_;
+  // Write rate set during initialization or by `DBImpl::SetDBOptions`
+  uint64_t max_delayed_write_rate_;
+  // Current write rate (bytes / second)
+  uint64_t delayed_write_rate_;
+
+  std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  explicit DelayWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+  explicit CompactionPressureToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~CompactionPressureToken();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..b6321a3bc
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
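+//
+// A minimal usage sketch (added commentary; the calls are the ones declared
+// in write_controller.h, the rates and sizes are made-up numbers). The
+// caller must hold the DB mutex and is expected to sleep for the returned
+// number of micros:
+//
+//   WriteController controller(40u << 20);  // also sets the max delayed rate
+//   auto token = controller.GetDelayToken(16u << 20 /* 16 MB/s */);
+//   uint64_t micros = controller.GetDelay(clock, 1u << 20 /* 1 MB write */);
+//   // sleep `micros`, then write; deleting `token` (and any other delay
+//   // tokens) ends the delayed-write regime.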
+//
+#include "db/write_controller.h"
+
+#include <ratio>
+#include <string>
+
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class TimeSetClock : public SystemClockWrapper {
+ public:
+  explicit TimeSetClock() : SystemClockWrapper(nullptr) {}
+  const char* Name() const override { return "TimeSetClock"; }
+  uint64_t now_micros_ = 6666;
+  uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+}  // anonymous namespace
+class WriteControllerTest : public testing::Test {
+ public:
+  WriteControllerTest() { clock_ = std::make_shared<TimeSetClock>(); }
+  std::shared_ptr<TimeSetClock> clock_;
+};
+
+// Make tests easier to read
+#define MILLION *1000000u
+#define MB MILLION
+#define MBPS MILLION
+#define SECS MILLION  // in microseconds
+
+TEST_F(WriteControllerTest, BasicAPI) {
+  WriteController controller(40 MBPS);  // also set max delayed rate
+  EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+  EXPECT_FALSE(controller.IsStopped());
+  EXPECT_FALSE(controller.NeedsDelay());
+  EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+  // set, get
+  controller.set_delayed_write_rate(20 MBPS);
+  EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+  EXPECT_FALSE(controller.IsStopped());
+  EXPECT_FALSE(controller.NeedsDelay());
+  EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+  {
+    // set with token, get
+    auto delay_token_0 = controller.GetDelayToken(10 MBPS);
+    EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS);
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+    // test with delay
+    EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 2 SECS;  // pay the "debt"
+
+    auto delay_token_1 = controller.GetDelayToken(2 MBPS);
+    EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 10 SECS;  // pay the "debt"
+
+    auto delay_token_2 = controller.GetDelayToken(1 MBPS);
+    EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 20 SECS;  // pay the "debt"
+
+    auto delay_token_3 = controller.GetDelayToken(20 MBPS);
+    EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 1 SECS;  // pay the "debt"
+
+    // 60M is more than the max rate of 40M. Max rate will be used.
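+    // (Each expectation above follows delay = bytes / rate; e.g. 20 MB at
+    // 10 MB/s is 2 SECS. Below, the requested 60 MB/s is clamped to the
+    // 40 MB/s maximum, so 20 MB costs 0.5 SECS.)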
+    EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+    auto delay_token_4 =
+        controller.GetDelayToken(controller.delayed_write_rate() * 3);
+    EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+    EXPECT_EQ(static_cast<uint64_t>(0.5 SECS),
+              controller.GetDelay(clock_.get(), 20 MB));
+
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+
+    // Test stop tokens
+    {
+      auto stop_token_1 = controller.GetStopToken();
+      EXPECT_TRUE(controller.IsStopped());
+      EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+      {
+        auto stop_token_2 = controller.GetStopToken();
+        EXPECT_TRUE(controller.IsStopped());
+        EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+      }
+      EXPECT_TRUE(controller.IsStopped());
+      EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+    }
+    // Stop tokens released
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+    EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+    // pay the previous "debt"
+    clock_->now_micros_ += static_cast<uint64_t>(0.5 SECS);
+    EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB));
+  }
+
+  // Delay tokens released
+  EXPECT_FALSE(controller.NeedsDelay());
+}
+
+TEST_F(WriteControllerTest, StartFilled) {
+  WriteController controller(10 MBPS);
+
+  // Attempt to write two things that combined would be allowed within
+  // a single refill interval
+  auto delay_token_0 =
+      controller.GetDelayToken(controller.delayed_write_rate());
+
+  // Verify no delay because write rate has not been exceeded within
+  // refill interval.
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+  // Allow refill (kMicrosPerRefill)
+  clock_->now_micros_ += 1000;
+
+  // Again
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+  // Control: something bigger that would exceed write rate within interval
+  uint64_t delay = controller.GetDelay(clock_.get(), 10 MB);
+  EXPECT_GT(1.0 * delay, 0.999 SECS);
+  EXPECT_LT(1.0 * delay, 1.001 SECS);
+}
+
+TEST_F(WriteControllerTest, DebtAccumulation) {
+  WriteController controller(10 MBPS);
+
+  std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+  // Accumulate a time delay debt with no passage of time, like many column
+  // families delaying writes simultaneously. (Old versions of
+  // WriteController would reset the debt on every GetDelayToken.)
+  uint64_t debt = 0;
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+    ASSERT_GT(delay, debt);
+    uint64_t incremental = delay - debt;
+    ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+    debt += incremental;
+  }
+
+  // Pay down the debt
+  clock_->now_micros_ += debt;
+  debt = 0;
+
+  // Now accumulate debt with some passage of time.
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    // Debt is accumulated in time, not in bytes, so this new write
+    // limit is not applied to prior requested delays, even if they are
+    // in progress.
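+    // (Illustration of the assertion below: on iteration i the rate is
+    // (i + 1) MB/s, so the 63 MB request adds 63 / (i + 1) seconds of new
+    // delay on top of the debt carried over from earlier iterations.)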
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+    ASSERT_GT(delay, debt);
+    uint64_t incremental = delay - debt;
+    ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+    debt += incremental;
+    uint64_t credit = debt / 2;
+    clock_->now_micros_ += credit;
+    debt -= credit;
+  }
+
+  // Pay down the debt
+  clock_->now_micros_ += debt;
+  debt = 0;    // consistent state
+  (void)debt;  // appease clang-analyze
+
+  // Verify paid down
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+
+  // Accumulate another debt, without accounting, and releasing tokens
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    // Big and small are delayed
+    ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB));
+    ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+    tokens[i].reset();
+  }
+  // All tokens released.
+  // Verify that releasing all tokens pays down debt, even with no time
+  // passage.
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+}
+
+// This may or may not be a "good" feature, but it's an old feature
+TEST_F(WriteControllerTest, CreditAccumulation) {
+  WriteController controller(10 MBPS);
+
+  std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+  // Ensure started
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+  clock_->now_micros_ += 10 SECS;
+
+  // Accumulate a credit
+  uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */;
+  clock_->now_micros_ += credit;
+
+  // Spend some credit (burst of I/O)
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB));
+    // In WriteController, credit is accumulated in bytes, not in time.
+    // After an "unnecessary" delay, all of our time credit will be
+    // translated to bytes on the next operation, in this case with
+    // setting 1 MBPS. So regardless of the rate at delay time, we just
+    // account for the bytes.
+    credit -= 63 MB;
+  }
+  // Spend remaining credit
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit));
+  // Verify
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+  clock_->now_micros_ += 10 SECS;
+
+  // Accumulate a credit, no accounting
+  clock_->now_micros_ += 1000 SECS;
+
+  // Spend a small amount, releasing tokens
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB));
+    tokens[i].reset();
+  }
+
+  // All tokens released.
+  // Verify credit is wiped away on new delay.
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..cc8645f37
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,815 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
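+//
+// High-level picture (added commentary, summarizing the state machine
+// declared in write_thread.h): writers link themselves into a lock-free
+// list headed by newest_writer_; the first writer to link becomes the group
+// leader, walks the list to form a write group, writes the WAL for the
+// whole group, and then either writes the memtables itself, hands the group
+// to the memtable-writer queue (pipelined writes), or lets followers write
+// memtables in parallel.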
+
+#include "db/write_thread.h"
+
+#include <chrono>
+#include <thread>
+
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+    : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+                          ? db_options.write_thread_max_yield_usec
+                          : 0),
+      slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+      allow_concurrent_memtable_write_(
+          db_options.allow_concurrent_memtable_write),
+      enable_pipelined_write_(db_options.enable_pipelined_write),
+      max_write_batch_group_size_bytes(
+          db_options.max_write_batch_group_size_bytes),
+      newest_writer_(nullptr),
+      newest_memtable_writer_(nullptr),
+      last_sequence_(0),
+      write_stall_dummy_(),
+      stall_mu_(),
+      stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+  // We're going to block. Lazily create the mutex. We guarantee
+  // propagation of this construction to the waker via the
+  // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+  // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+  // we install below.
+  w->CreateMutex();
+
+  auto state = w->state.load(std::memory_order_acquire);
+  assert(state != STATE_LOCKED_WAITING);
+  if ((state & goal_mask) == 0 &&
+      w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+    // we have permission (and an obligation) to use StateMutex
+    std::unique_lock<std::mutex> guard(w->StateMutex());
+    w->StateCV().wait(guard, [w] {
+      return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+    });
+    state = w->state.load(std::memory_order_relaxed);
+  }
+  // else tricky. Goal is met or CAS failed. In the latter case the waker
+  // must have changed the state, and compare_exchange_strong has updated
+  // our local variable with the new one. At the moment WriteThread never
+  // waits for a transition across intermediate states, so we know that
+  // since a state change has occurred the goal must have been met.
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+                                AdaptationContext* ctx) {
+  uint8_t state = 0;
+
+  // 1. Busy loop using "pause" for 1 micro sec
+  // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+  // 3. Else blocking wait
+
+  // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+  // is the effect of the pause instruction), so 200 iterations is a bit
+  // more than a microsecond. This is long enough that waits longer than
+  // this can amortize the cost of accessing the clock and yielding.
+  for (uint32_t tries = 0; tries < 200; ++tries) {
+    state = w->state.load(std::memory_order_acquire);
+    if ((state & goal_mask) != 0) {
+      return state;
+    }
+    port::AsmVolatilePause();
+  }
+
+  // This is below the fast path, so that the stat is zero when all writes are
+  // from the same thread.
+  PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+  // If we're only going to end up waiting a short period of time,
+  // it can be a lot more efficient to call std::this_thread::yield()
+  // in a loop than to block in StateMutex(). For reference, on my 4.0
+  // SELinux test server with support for syscall auditing enabled, the
+  // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+  // 2.7 usec, and the average is more like 10 usec. That can be a big
+  // drag on RocksDB's single-writer design. Of course, spinning is a
+  // bad idea if other threads are waiting to run or if we're going to
+  // wait for a long time. How do we decide?
+  //
+  // We break waiting into 3 categories: short-uncontended,
+  // short-contended, and long. If we had an oracle, then we would always
+  // spin for short-uncontended, always block for long, and our choice for
+  // short-contended might depend on whether we were trying to optimize
+  // RocksDB throughput or avoid being greedy with system resources.
+  //
+  // Bucketing into short or long is easy by measuring elapsed time.
+  // Differentiating short-uncontended from short-contended is a bit
+  // trickier, but not too bad. We could look for involuntary context
+  // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+  // (portability code and CPU) to just look for yield calls that take
+  // longer than we expect. sched_yield() doesn't actually result in any
+  // context switch overhead if there are no other runnable processes
+  // on the current core, in which case it usually takes less than
+  // a microsecond.
+  //
+  // There are two primary tunables here: the threshold between "short"
+  // and "long" waits, and the threshold at which we suspect that a yield
+  // is slow enough to indicate we should probably block. If these
+  // thresholds are chosen well then CPU-bound workloads that don't
+  // have more threads than cores will experience few context switches
+  // (voluntary or involuntary), and the total number of context switches
+  // (voluntary and involuntary) will not be dramatically larger (maybe
+  // 2x) than the number of voluntary context switches that occur when
+  // --max_yield_wait_micros=0.
+  //
+  // There's another constant, which is the number of slow yields we will
+  // tolerate before reversing our previous decision. Solitary slow
+  // yields are pretty common (low-priority small jobs ready to run),
+  // so this should be at least 2. We set this conservatively to 3 so
+  // that we can also immediately schedule a ctx adaptation, rather than
+  // waiting for the next update_ctx.
+
+  const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. The credit is
+  // added by yield being successful before timing out, and decreased
+  // otherwise.
+  auto& yield_credit = ctx->value;
+  // Update the yield_credit based on sample runs or right after a hard
+  // failure
+  bool update_ctx = false;
+  // Should we reinforce the yield credit
+  bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate would
+  // be 1/sampling_base.
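+  // (In other words, roughly one in 256 waits re-measures whether spinning
+  // actually beats blocking and updates yield_credit accordingly.)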
+  const int sampling_base = 256;
+
+  if (max_yield_usec_ > 0) {
+    update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+    if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+      // we're updating the adaptation statistics, or spinning has >
+      // 50% chance of being shorter than max_yield_usec_ and causing no
+      // involuntary context switches
+      auto spin_begin = std::chrono::steady_clock::now();
+
+      // this variable doesn't include the final yield (if any) that
+      // causes the goal to be met
+      size_t slow_yield_count = 0;
+
+      auto iter_begin = spin_begin;
+      while ((iter_begin - spin_begin) <=
+             std::chrono::microseconds(max_yield_usec_)) {
+        std::this_thread::yield();
+
+        state = w->state.load(std::memory_order_acquire);
+        if ((state & goal_mask) != 0) {
+          // success
+          would_spin_again = true;
+          break;
+        }
+
+        auto now = std::chrono::steady_clock::now();
+        if (now == iter_begin ||
+            now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+          // conservatively count it as a slow yield if our clock isn't
+          // accurate enough to measure the yield duration
+          ++slow_yield_count;
+          if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+            // Not just one ivcsw, but several. Immediately update
+            // yield_credit and fall back to blocking
+            update_ctx = true;
+            break;
+          }
+        }
+        iter_begin = now;
+      }
+    }
+  }
+
+  if ((state & goal_mask) == 0) {
+    TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+    state = BlockingAwaitState(w, goal_mask);
+  }
+
+  if (update_ctx) {
+    // Since our update is sample based, it is ok if a thread overwrites the
+    // updates by other threads. Thus the update does not have to be atomic.
+    auto v = yield_credit.load(std::memory_order_relaxed);
+    // fixed point exponential decay with decay constant 1/1024, with +1
+    // and -1 scaled to avoid overflow for int32_t
+    //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+    v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+    yield_credit.store(v, std::memory_order_relaxed);
+  }
+
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+  assert(w);
+  auto state = w->state.load(std::memory_order_acquire);
+  if (state == STATE_LOCKED_WAITING ||
+      !w->state.compare_exchange_strong(state, new_state)) {
+    assert(state == STATE_LOCKED_WAITING);
+
+    std::lock_guard<std::mutex> guard(w->StateMutex());
+    assert(w->state.load(std::memory_order_relaxed) != new_state);
+    w->state.store(new_state, std::memory_order_relaxed);
+    w->StateCV().notify_one();
+  }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  assert(w->state == STATE_INIT);
+  Writer* writers = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    // If a write stall is in effect and w->no_slowdown is not true,
+    // block here until the stall is cleared. If it's true, then return
+    // immediately.
+    if (writers == &write_stall_dummy_) {
+      if (w->no_slowdown) {
+        w->status = Status::Incomplete("Write stall");
+        SetState(w, STATE_COMPLETED);
+        return false;
+      }
+      // Since no_slowdown is false, wait here to be notified of the write
+      // stall clearing
+      {
+        MutexLock lock(&stall_mu_);
+        writers = newest_writer->load(std::memory_order_relaxed);
+        if (writers == &write_stall_dummy_) {
+          TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
+          stall_cv_.Wait();
+          // Load newest_writers_ again since it may have changed
+          writers = newest_writer->load(std::memory_order_relaxed);
+          continue;
+        }
+      }
+    }
+    w->link_older = writers;
+    if (newest_writer->compare_exchange_weak(writers, w)) {
+      return (writers == nullptr);
+    }
+  }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+                            std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  Writer* w = last_writer;
+  while (true) {
+    // Unset link_newer pointers to make sure when we call
+    // CreateMissingNewerLinks later it creates all missing links.
+    w->link_newer = nullptr;
+    w->write_group = nullptr;
+    if (w == leader) {
+      break;
+    }
+    w = w->link_older;
+  }
+  Writer* newest = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    leader->link_older = newest;
+    if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+      return (newest == nullptr);
+    }
+  }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+  while (true) {
+    Writer* next = head->link_older;
+    if (next == nullptr || next->link_newer != nullptr) {
+      assert(next == nullptr || next->link_newer == head);
+      break;
+    }
+    next->link_newer = head;
+    head = next;
+  }
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+  assert(write_group.size > 0);
+  Writer* leader = write_group.leader;
+  if (write_group.size == 1) {
+    write_group.leader = nullptr;
+    write_group.last_writer = nullptr;
+  } else {
+    assert(leader->link_newer != nullptr);
+    leader->link_newer->link_older = nullptr;
+    write_group.leader = leader->link_newer;
+  }
+  write_group.size -= 1;
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+  assert(write_group.size > 1);
+  assert(w != write_group.leader);
+  if (w == write_group.last_writer) {
+    w->link_older->link_newer = nullptr;
+    write_group.last_writer = w->link_older;
+  } else {
+    w->link_older->link_newer = w->link_newer;
+    w->link_newer->link_older = w->link_older;
+  }
+  write_group.size -= 1;
+  SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+  LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk the writer list until w->write_group != nullptr. The current write
+  // group will not have a mix of slowdown/no_slowdown, so it's ok to stop at
+  // that point
+  Writer* w = write_stall_dummy_.link_older;
+  Writer* prev = &write_stall_dummy_;
+  while (w != nullptr && w->write_group == nullptr) {
+    if (w->no_slowdown) {
+      prev->link_older = w->link_older;
+      w->status = Status::Incomplete("Write stall");
+      SetState(w, STATE_COMPLETED);
+      // Only update `link_newer` if it's already set.
+      // `CreateMissingNewerLinks()` will update the nullptr `link_newer`
+      // later, which assumes the first non-nullptr `link_newer` is the last
+      // nullptr link in the writer list.
+      // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop
+      // updating the whole list when it sees the first non-nullptr link.
+      if (prev->link_older && prev->link_older->link_newer) {
+        prev->link_older->link_newer = prev;
+      }
+      w = prev->link_older;
+    } else {
+      prev = w;
+      w = w->link_older;
+    }
+  }
+}
+
+void WriteThread::EndWriteStall() {
+  MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue. This will unblock
+  // pending write threads to enqueue themselves
+  assert(newest_writer_.load(std::memory_order_relaxed) ==
+         &write_stall_dummy_);
+  assert(write_stall_dummy_.link_older != nullptr);
+  write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+  newest_writer_.exchange(write_stall_dummy_.link_older);
+
+  // Wake up writers
+  stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+  assert(w->batch != nullptr);
+
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+  if (linked_as_leader) {
+    SetState(w, STATE_GROUP_LEADER);
+  }
+
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
+
+  if (!linked_as_leader) {
+    /**
+     * Wait until:
+     * 1) An existing leader picks us as the new leader when it finishes
+     * 2) An existing leader picks us as its follower and
+     *   2.1) finishes the memtable writes on our behalf
+     *   2.2) Or tells us to finish the memtable writes in parallel
+     * 3) (pipelined write) An existing leader picks us as its follower and
+     *    finishes book-keeping and WAL write for us, enqueues us as a
+     *    pending memtable writer, and
+     *   3.1) we become memtable writer group leader, or
+     *   3.2) an existing memtable writer group leader tells us to finish
+     *        memtable writes in parallel.
+     */
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+    AwaitState(w,
+               STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+                   STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+               &jbg_ctx);
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+  }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+                                            WriteGroup* write_group) {
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->last_writer = leader;
+  write_group->size = 1;
+  Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+  // This is safe regardless of any db mutex status of the caller. Previous
+  // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourself as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourself,
+  // so we have already received our MarkJoined).
+  CreateMissingNewerLinks(newest_writer);
+
+  // Tricky. Iteration start (leader) is exclusive and finish
+  // (newest_writer) is inclusive.
+  // Iteration goes from old to new.
+  Writer* w = leader;
+  while (w != newest_writer) {
+    assert(w->link_newer);
+    w = w->link_newer;
+
+    if (w->sync && !leader->sync) {
+      // Do not include a sync write into a batch handled by a non-sync
+      // write.
+      break;
+    }
+
+    if (w->no_slowdown != leader->no_slowdown) {
+      // Do not mix writes that are ok with delays with the ones that
+      // request fail on delays.
+      break;
+    }
+
+    if (w->disable_wal != leader->disable_wal) {
+      // Do not mix writes that enable WAL with the ones whose
+      // WAL is disabled.
+      break;
+    }
+
+    if (w->protection_bytes_per_key != leader->protection_bytes_per_key) {
+      // Do not mix writes with different levels of integrity protection.
+      break;
+    }
+
+    if (w->rate_limiter_priority != leader->rate_limiter_priority) {
+      // Do not mix writes with different rate limiter priorities.
+      break;
+    }
+
+    if (w->batch == nullptr) {
+      // Do not include those writes with nullptr batch. Those are not
+      // writes, those are something else. They want to be alone
+      break;
+    }
+
+    if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+      // don't batch writes that don't want to be batched
+      break;
+    }
+
+    auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+    if (size + batch_size > max_size) {
+      // Do not make batch too big
+      break;
+    }
+
+    w->write_group = write_group;
+    size += batch_size;
+    write_group->last_writer = w;
+    write_group->size++;
+  }
+  TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+  return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+                                        WriteGroup* write_group) {
+  assert(leader != nullptr);
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
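+  // (Worked example, added commentary assuming the default
+  // max_write_batch_group_size_bytes of 1 MB: min_batch_size_bytes is then
+  // 128 KB, so a 4 KB leader batch caps its group at 4 KB + 128 KB rather
+  // than the full 1 MB.)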
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->size = 1;
+  Writer* last_writer = leader;
+
+  if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+    Writer* newest_writer = newest_memtable_writer_.load();
+    CreateMissingNewerLinks(newest_writer);
+
+    Writer* w = leader;
+    while (w != newest_writer) {
+      assert(w->link_newer);
+      w = w->link_newer;
+
+      if (w->batch == nullptr) {
+        break;
+      }
+
+      if (w->batch->HasMerge()) {
+        break;
+      }
+
+      if (!allow_concurrent_memtable_write_) {
+        auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+        if (size + batch_size > max_size) {
+          // Do not make batch too big
+          break;
+        }
+        size += batch_size;
+      }
+
+      w->write_group = write_group;
+      last_writer = w;
+      write_group->size++;
+    }
+  }
+
+  write_group->last_writer = last_writer;
+  write_group->last_sequence =
+      last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) -
+      1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+                                       WriteGroup& write_group) {
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+
+  Writer* newest_writer = last_writer;
+  if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+                                                       nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = last_writer->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+  }
+  Writer* w = leader;
+  while (true) {
+    if (!write_group.status.ok()) {
+      w->status = write_group.status;
+    }
+    Writer* next = w->link_newer;
+    if (w != leader) {
+      SetState(w, STATE_COMPLETED);
+    }
+    if (w == last_writer) {
+      break;
+    }
+    assert(next);
+    w = next;
+  }
+  // Note that leader has to exit last, since it owns the write group.
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+  assert(write_group != nullptr);
+  write_group->running.store(write_group->size);
+  for (auto w : *write_group) {
+    SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+  }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx(
+    "CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+  auto* write_group = w->write_group;
+  if (!w->status.ok()) {
+    std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+    write_group->status = w->status;
+  }
+
+  if (write_group->running-- > 1) {
+    // we're not the last one
+    AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+    return false;
+  }
+  // else we're the last parallel worker and should perform exit duties.
+  w->status = write_group->status;
+  // Callers of this function must ensure w->status is checked.
+  write_group->status.PermitUncheckedError();
+  return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+  auto* write_group = w->write_group;
+
+  assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+  assert(write_group->status.ok());
+  ExitAsBatchGroupLeader(*write_group, write_group->status);
+  assert(w->status.ok());
+  assert(w->state == STATE_COMPLETED);
+  SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+                                         Status& status) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::ExitAsBatchGroupLeader:Start",
+                           &write_group);
+
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  assert(leader->link_older == nullptr);
+
+  // If status is non-ok already, then write_group.status won't have the
+  // chance of being propagated to the caller.
+  if (!status.ok()) {
+    write_group.status.PermitUncheckedError();
+  }
+
+  // Propagate memtable write error to the whole group.
+  if (status.ok() && !write_group.status.ok()) {
+    status = write_group.status;
+  }
+
+  if (enable_pipelined_write_) {
+    // We insert a dummy Writer right before our current write_group. This
+    // allows us to unlink our write_group without the risk that a subsequent
+    // writer becomes a new leader and might overtake us and add itself to
+    // the memtable-writer-list before we can do so. This ensures that
+    // writers are added to the memtable-writer-list in the exact same order
+    // in which they were in the newest_writer list.
+    // This must happen before completing the writers from our group to
+    // prevent a race where the owning thread of one of these writers can
+    // start a new write operation.
+    Writer dummy;
+    Writer* head = newest_writer_.load(std::memory_order_acquire);
+    if (head != last_writer ||
+        !newest_writer_.compare_exchange_strong(head, &dummy)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+      // we did the compare_exchange_strong (causing it to fail). In the
+      // latter case compare_exchange_strong has the effect of re-reading its
+      // first param (head). No need to retry a failing CAS, because only a
+      // departing leader (which we are at the moment) can remove nodes from
+      // the list.
+      assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done) we
+      // will be able to traverse w->link_newer below.
+      CreateMissingNewerLinks(head);
+      assert(last_writer->link_newer != nullptr);
+      last_writer->link_newer->link_older = &dummy;
+      dummy.link_newer = last_writer->link_newer;
+    }
+
+    // Complete writers that don't write to memtable
+    for (Writer* w = last_writer; w != leader;) {
+      Writer* next = w->link_older;
+      w->status = status;
+      if (!w->ShouldWriteToMemtable()) {
+        CompleteFollower(w, write_group);
+      }
+      w = next;
+    }
+    if (!leader->ShouldWriteToMemtable()) {
+      CompleteLeader(write_group);
+    }
+
+    TEST_SYNC_POINT_CALLBACK(
+        "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+        &write_group);
+
+    // Link the remaining writers of the group to the memtable writer list.
+    // We have to link our group to the memtable writer queue before waking
+    // up the next leader or setting newest_writer_ to null, otherwise the
+    // next leader can run ahead of us and link to the memtable writer queue
+    // before we do.
+    if (write_group.size > 0) {
+      if (LinkGroup(write_group, &newest_memtable_writer_)) {
+        // The leader can now be different from current writer.
+        SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+      }
+    }
+
+    // Unlink the dummy writer from the list and identify the new leader
+    head = newest_writer_.load(std::memory_order_acquire);
+    if (head != &dummy ||
+        !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      CreateMissingNewerLinks(head);
+      Writer* new_leader = dummy.link_newer;
+      assert(new_leader != nullptr);
+      new_leader->link_older = nullptr;
+      SetState(new_leader, STATE_GROUP_LEADER);
+    }
+
+    AwaitState(leader,
+               STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER |
+                   STATE_COMPLETED,
+               &eabgl_ctx);
+  } else {
+    Writer* head = newest_writer_.load(std::memory_order_acquire);
+    if (head != last_writer ||
+        !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+      // we did the compare_exchange_strong (causing it to fail). In the
+      // latter case compare_exchange_strong has the effect of re-reading
+      // its first param (head). No need to retry a failing CAS, because
+      // only a departing leader (which we are at the moment) can remove
+      // nodes from the list.
+      assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done)
+      // we will be able to traverse w->link_newer below. This function
+      // can only be called from an active leader, only a leader can
+      // clear newest_writer_, we didn't, and only a clear newest_writer_
+      // could cause the next leader to start their work without a call
+      // to MarkJoined, so we can definitely conclude that no other leader
+      // work is going on here (with or without db mutex).
+      CreateMissingNewerLinks(head);
+      assert(last_writer->link_newer != nullptr);
+      assert(last_writer->link_newer->link_older == last_writer);
+      last_writer->link_newer->link_older = nullptr;
+
+      // Next leader didn't self-identify, because newest_writer_ wasn't
+      // nullptr when they enqueued (we were definitely enqueued before them
+      // and are still in the list). That means leader handoff occurs when
+      // we call MarkJoined
+      SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+    }
+    // else nobody else was waiting, although there might already be a new
+    // leader now
+
+    while (last_writer != leader) {
+      assert(last_writer);
+      last_writer->status = status;
+      // we need to read link_older before calling SetState, because as soon
+      // as it is marked committed the other thread's Await may return and
+      // deallocate the Writer.
+      auto next = last_writer->link_older;
+      SetState(last_writer, STATE_COMPLETED);
+
+      last_writer = next;
+    }
+  }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+  assert(w != nullptr && w->batch == nullptr);
+  mu->Unlock();
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+  if (!linked_as_leader) {
+    TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+    // The last leader will not pick us as a follower since our batch is
+    // nullptr.
+    AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+  }
+  if (enable_pipelined_write_) {
+    WaitForMemTableWriters();
+  }
+  mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+  assert(w != nullptr);
+  Writer* newest_writer = w;
+  if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = w->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_GROUP_LEADER);
+  }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+  assert(enable_pipelined_write_);
+  if (newest_memtable_writer_.load() == nullptr) {
+    return;
+  }
+  Writer w;
+  if (!LinkOne(&w, &newest_memtable_writer_)) {
+    AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+  }
+  newest_memtable_writer_.store(nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..0ea51d922
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,440 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+  enum State : uint8_t {
+    // The initial state of a writer. This is a Writer that is
+    // waiting in JoinBatchGroup. This state can be left when another
+    // thread informs the waiter that it has become a group leader
+    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+    // non-parallel informs a follower that its writes have been committed
+    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel needs this Writer to apply its batch
+    // (-> STATE_PARALLEL_MEMTABLE_WRITER).
+    STATE_INIT = 1,
+
+    // The state used to inform a waiting Writer that it has become the
+    // leader, and it should now build a write batch group. Tricky:
+    // this state is not used if newest_writer_ is empty when a writer
+    // enqueues itself, because there is no need to wait (or even to
+    // create the mutex and condvar used to wait) in that case. This is
+    // a terminal state unless the leader chooses to make this a parallel
+    // batch, in which case the last parallel worker to finish will move
+    // the leader to STATE_COMPLETED.
+    STATE_GROUP_LEADER = 2,
+
+    // The state used to inform a waiting writer that it has become the
+    // leader of the memtable writer group. The leader will either write
+    // the memtable for the whole group, or launch a parallel group write
+    // to the memtable by calling LaunchParallelMemTableWriters.
+    STATE_MEMTABLE_WRITER_LEADER = 4,
+
+    // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader who launched
+    // the parallel writer group, or one of the followers. The writer should
+    // then apply its batch to the memtable concurrently and call
+    // CompleteParallelMemTableWriter.
+    STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+    // A follower whose writes have been applied, or a parallel leader
+    // whose followers have all finished their work. This is a terminal
+    // state.
+    STATE_COMPLETED = 16,
+
+    // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCV().
+    STATE_LOCKED_WAITING = 32,
+  };
+
+  struct Writer;
+
+  struct WriteGroup {
+    Writer* leader = nullptr;
+    Writer* last_writer = nullptr;
+    SequenceNumber last_sequence;
+    // Before running reaches zero, accessing status requires
+    // leader->StateMutex().
+    Status status;
+    std::atomic<size_t> running;
+    size_t size = 0;
+
+    struct Iterator {
+      Writer* writer;
+      Writer* last_writer;
+
+      explicit Iterator(Writer* w, Writer* last)
+          : writer(w), last_writer(last) {}
+
+      Writer* operator*() const { return writer; }
+
+      Iterator& operator++() {
+        assert(writer != nullptr);
+        if (writer == last_writer) {
+          writer = nullptr;
+        } else {
+          writer = writer->link_newer;
+        }
+        return *this;
+      }
+
+      bool operator!=(const Iterator& other) const {
+        return writer != other.writer;
+      }
+    };
+
+    Iterator begin() const { return Iterator(leader, last_writer); }
+    Iterator end() const { return Iterator(nullptr, nullptr); }
+  };
+
+  // Information kept for every waiting writer.
+  struct Writer {
+    WriteBatch* batch;
+    bool sync;
+    bool no_slowdown;
+    bool disable_wal;
+    Env::IOPriority rate_limiter_priority;
+    bool disable_memtable;
+    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
+    size_t protection_bytes_per_key;
+    PreReleaseCallback* pre_release_callback;
+    PostMemTableCallback* post_memtable_callback;
+    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t log_ref;   // log number that memtable insert should reference
+    WriteCallback* callback;
+    bool made_waitable;          // records lazy construction of mutex and cv
+    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
+    WriteGroup* write_group;
+    SequenceNumber sequence;  // the sequence number to use for the first key
+    Status status;
+    Status callback_status;  // status returned by callback->Callback()
+
+    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
+
+    Writer()
+        : batch(nullptr),
+          sync(false),
+          no_slowdown(false),
+          disable_wal(false),
+          rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+          disable_memtable(false),
+          batch_cnt(0),
+          protection_bytes_per_key(0),
+          pre_release_callback(nullptr),
+          post_memtable_callback(nullptr),
+          log_used(0),
+          log_ref(0),
+          callback(nullptr),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    Writer(const WriteOptions& write_options, WriteBatch* _batch,
+           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+           size_t _batch_cnt = 0,
+           PreReleaseCallback* _pre_release_callback = nullptr,
+           PostMemTableCallback* _post_memtable_callback = nullptr)
+        : batch(_batch),
+          sync(write_options.sync),
+          no_slowdown(write_options.no_slowdown),
+          disable_wal(write_options.disableWAL),
+          rate_limiter_priority(write_options.rate_limiter_priority),
+          disable_memtable(_disable_memtable),
+          batch_cnt(_batch_cnt),
+          protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+          pre_release_callback(_pre_release_callback),
+          post_memtable_callback(_post_memtable_callback),
+          log_used(0),
+          log_ref(_log_ref),
+          callback(_callback),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        StateMutex().~mutex();
+        StateCV().~condition_variable();
+      }
+      status.PermitUncheckedError();
+      callback_status.PermitUncheckedError();
+    }
+
+    bool CheckCallback(DB* db) {
+      if (callback != nullptr) {
+        callback_status = callback->Callback(db);
+      }
+      return callback_status.ok();
+    }
+
+    void CreateMutex() {
+      if (!made_waitable) {
+        // Note that made_waitable is tracked separately from state
+        // transitions, because we can't atomically create the mutex and
+        // link into the list.
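+        // The construction below is plain placement new into raw aligned
+        // storage. As a standalone illustration of the pattern (hypothetical
+        // variable names, not part of this class):
+        //
+        //   std::aligned_storage<sizeof(std::mutex)>::type raw;
+        //   std::mutex* mu = new (&raw) std::mutex;  // construct lazily
+        //   ...
+        //   mu->~mutex();  // must be destroyed manually
+        //
+        // Here the matching destructor calls live in ~Writer(), guarded by
+        // made_waitable.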
+        made_waitable = true;
+        new (&state_mutex_bytes) std::mutex;
+        new (&state_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // Returns the aggregate status of this Writer.
+    Status FinalStatus() {
+      if (!status.ok()) {
+        // a non-ok memtable write status takes precedence
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      } else if (!callback_status.ok()) {
+        // if the callback failed then that is the status we want
+        // because a memtable insert should not have been attempted
+        assert(callback != nullptr);
+        assert(status.ok());
+        return callback_status;
+      } else {
+        // if there is no callback then we only care about
+        // the memtable insert status
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      }
+    }
+
+    bool CallbackFailed() {
+      return (callback != nullptr) && !callback_status.ok();
+    }
+
+    bool ShouldWriteToMemtable() {
+      return status.ok() && !CallbackFailed() && !disable_memtable;
+    }
+
+    bool ShouldWriteToWAL() {
+      return status.ok() && !CallbackFailed() && !disable_wal;
+    }
+
+    // No other mutexes may be acquired while holding StateMutex(); it is
+    // always last in the lock order.
+    std::mutex& StateMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+    }
+
+    std::condition_variable& StateCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&state_cv_bytes));
+    }
+  };
+
+  struct AdaptationContext {
+    const char* name;
+    std::atomic<int32_t> value;
+
+    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+  };
+
+  explicit WriteThread(const ImmutableDBOptions& db_options);
+
+  virtual ~WriteThread() = default;
+
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, waits until the
+  // caller should perform some work, and returns the current state of the
+  // writer. If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_COMPLETED.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+  //
+  // The db mutex SHOULD NOT be held when calling this function, because
+  // it will block.
+  //
+  // Writer* w: Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
+
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
+  //
+  // Writer* leader: Writer that is STATE_GROUP_LEADER
+  // WriteGroup* write_group: Out-param of group members
+  // returns: Total batch group byte size
+  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+  // Unlinks the Writers in a batch group, wakes up the non-leaders,
+  // and wakes up the next leader (if any).
+  //
+  // WriteGroup* write_group: the write group
+  // Status status: Status of write operation
+  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+  // Exit batch group on behalf of batch group leader.
+  void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writer_ list.
+  // The leader should either write the memtable for the whole group and
+  // call ExitAsMemTableWriter, or launch a parallel memtable write through
+  // LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+  // The memtable writer group leader, or the last finished writer in a
+  // parallel write group, exits from the newest_memtable_writer_ list, and
+  // wakes up the next leader if needed.
+  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+  // the non-leader members of this write batch group. Sets Writer::sequence
+  // before waking them up.
+  //
+  // WriteGroup* write_group: Extra state used to coordinate the parallel add
+  void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+  // Reports the completion of w's batch to the parallel group leader, and
+  // waits for the rest of the parallel batch to complete. Returns true
+  // if this thread is the last to complete, and hence should advance
+  // the sequence number and then exit the group (e.g. via
+  // ExitAsBatchGroupFollower); false if someone else has already taken
+  // responsibility for that.
+  bool CompleteParallelMemTableWriter(Writer* w);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w: A Writer not eligible for batching
+  // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+  // REQUIRES: db mutex held
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
+
+  // Waits for all parallel memtable writers to finish, in case pipelined
+  // write is enabled.
+  void WaitForMemTableWriters();
+
+  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+    if (sequence > last_sequence_) {
+      last_sequence_ = sequence;
+    }
+    return last_sequence_;
+  }
+
+  // Inserts a dummy writer at the tail of the write queue to indicate a write
+  // stall, and fails any writers in the queue with no_slowdown set to true.
+  void BeginWriteStall();
+
+  // Removes the dummy writer and wakes up waiting writers.
+  void EndWriteStall();
+
+ private:
+  // See AwaitState.
+  const uint64_t max_yield_usec_;
+  const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers to write to the memtable concurrently.
+  const bool allow_concurrent_memtable_write_;
+
+  // Enable pipelined write to the WAL and memtable.
+  const bool enable_pipelined_write_;
+
+  // The maximum number of bytes that may be written in a single batch of
+  // WAL or memtable writes. It is enforced when the leader's write size
+  // is larger than 1/8 of this limit.
+  const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  // Points to the newest pending memtable writer. Used only when pipelined
+  // write is enabled.
+  std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence that has been consumed by a writer. The sequence
+  // is not necessarily visible to reads because the writer can be ongoing.
+  SequenceNumber last_sequence_;
+
+  // A dummy writer to indicate a write stall condition. This will be inserted
+  // at the tail of the writer queue by the leader, so newer writers can just
+  // check for this and bail.
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // than on the writer queue.
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
+  // Waits for w->state & goal_mask using w->StateMutex(). Returns
+  // the state that satisfies goal_mask.
+  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+  // Blocks until w->state & goal_mask, returning the state value
+  // that satisfied the predicate. Uses ctx to adaptively use
+  // std::this_thread::yield() to avoid mutex overheads. ctx should be
+  // a context-dependent static.
+  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+  // Sets the writer state and wakes the writer up if it is waiting.
+  void SetState(Writer* w, uint8_t new_state);
+
+  // Links w into the newest_writer list. Returns true if w was linked directly
+  // into the leader position. Safe to call from multiple threads without
+  // external locking.
+  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+  // Links a write group into the newest_writer list as a whole, while keeping
+  // the order of the writers unchanged. Returns true if the group was linked
+  // directly into the leader position.
+  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+  // Computes any missing link_newer links. Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
+
+  // Sets the leader in write_group to the completed state and removes it from
+  // the write group.
+  void CompleteLeader(WriteGroup& write_group);
+
+  // Sets a follower in write_group to the completed state and removes it from
+  // the write group.
+  void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
-- 
cgit v1.2.3
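
As a reader's companion to the state machine documented in write_thread.h above, here is a rough sketch of how calling code might drive the batching protocol. It is distilled from the method contracts in the header, not from the actual DBImpl::WriteImpl logic; the function name WriteSketch is hypothetical, and WAL writing, sequence assignment, and error handling are omitted:

  #include "db/write_thread.h"

  using ROCKSDB_NAMESPACE::WriteThread;

  // Hypothetical driver: each writing thread constructs a Writer, joins a
  // batch group, and then acts according to the state it wakes up in.
  void WriteSketch(WriteThread& write_thread, WriteThread::Writer& w) {
    write_thread.JoinBatchGroup(&w);  // blocks until this writer has a role
    if (w.state == WriteThread::STATE_GROUP_LEADER) {
      WriteThread::WriteGroup group;
      write_thread.EnterAsBatchGroupLeader(&w, &group);
      // ... write the WAL (and possibly the memtable) for the whole group ...
      write_thread.ExitAsBatchGroupLeader(group, w.status);
    } else if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
      // ... apply only our own batch to the memtable ...
      if (write_thread.CompleteParallelMemTableWriter(&w)) {
        // The last finisher exits on behalf of the whole group.
        write_thread.ExitAsBatchGroupFollower(&w);
      }
    }
    // Otherwise the state is STATE_COMPLETED: the leader wrote on our behalf.
  }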